Skip to content

Commit db9f604

Browse files
authored
Merge pull request #1929 from sayantn/non-temporal
Fixes for non-temporal intrinsics
2 parents 53e8661 + 0a24694 commit db9f604

File tree

6 files changed

+39
-6
lines changed

6 files changed

+39
-6
lines changed

crates/core_arch/src/x86/avx.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1833,6 +1833,7 @@ pub unsafe fn _mm256_lddqu_si256(mem_addr: *const __m256i) -> __m256i {
18331833
#[cfg_attr(test, assert_instr(vmovntdq))]
18341834
#[stable(feature = "simd_x86", since = "1.27.0")]
18351835
pub unsafe fn _mm256_stream_si256(mem_addr: *mut __m256i, a: __m256i) {
1836+
// see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
18361837
crate::arch::asm!(
18371838
vps!("vmovntdq", ",{a}"),
18381839
p = in(reg) mem_addr,
@@ -1861,6 +1862,7 @@ pub unsafe fn _mm256_stream_si256(mem_addr: *mut __m256i, a: __m256i) {
18611862
#[stable(feature = "simd_x86", since = "1.27.0")]
18621863
#[allow(clippy::cast_ptr_alignment)]
18631864
pub unsafe fn _mm256_stream_pd(mem_addr: *mut f64, a: __m256d) {
1865+
// see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
18641866
crate::arch::asm!(
18651867
vps!("vmovntpd", ",{a}"),
18661868
p = in(reg) mem_addr,
@@ -1890,6 +1892,7 @@ pub unsafe fn _mm256_stream_pd(mem_addr: *mut f64, a: __m256d) {
18901892
#[stable(feature = "simd_x86", since = "1.27.0")]
18911893
#[allow(clippy::cast_ptr_alignment)]
18921894
pub unsafe fn _mm256_stream_ps(mem_addr: *mut f32, a: __m256) {
1895+
// see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
18931896
crate::arch::asm!(
18941897
vps!("vmovntps", ",{a}"),
18951898
p = in(reg) mem_addr,
@@ -4291,6 +4294,7 @@ mod tests {
42914294
let a = _mm256_setr_epi64x(1, 2, 3, 4);
42924295
let mut r = _mm256_undefined_si256();
42934296
_mm256_stream_si256(ptr::addr_of_mut!(r), a);
4297+
_mm_sfence();
42944298
assert_eq_m256i(r, a);
42954299
}
42964300

@@ -4305,6 +4309,7 @@ mod tests {
43054309
let mut mem = Memory { data: [-1.0; 4] };
43064310

43074311
_mm256_stream_pd(ptr::addr_of_mut!(mem.data[0]), a);
4312+
_mm_sfence();
43084313
for i in 0..4 {
43094314
assert_eq!(mem.data[i], get_m256d(a, i));
43104315
}
@@ -4321,6 +4326,7 @@ mod tests {
43214326
let mut mem = Memory { data: [-1.0; 8] };
43224327

43234328
_mm256_stream_ps(ptr::addr_of_mut!(mem.data[0]), a);
4329+
_mm_sfence();
43244330
for i in 0..8 {
43254331
assert_eq!(mem.data[i], get_m256(a, i));
43264332
}

crates/core_arch/src/x86/avx512f.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29593,6 +29593,7 @@ pub fn _mm_mask_testn_epi64_mask(k: __mmask8, a: __m128i, b: __m128i) -> __mmask
2959329593
#[cfg_attr(test, assert_instr(vmovntps))]
2959429594
#[allow(clippy::cast_ptr_alignment)]
2959529595
pub unsafe fn _mm512_stream_ps(mem_addr: *mut f32, a: __m512) {
29596+
// see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
2959629597
crate::arch::asm!(
2959729598
vps!("vmovntps", ",{a}"),
2959829599
p = in(reg) mem_addr,
@@ -29619,6 +29620,7 @@ pub unsafe fn _mm512_stream_ps(mem_addr: *mut f32, a: __m512) {
2961929620
#[cfg_attr(test, assert_instr(vmovntpd))]
2962029621
#[allow(clippy::cast_ptr_alignment)]
2962129622
pub unsafe fn _mm512_stream_pd(mem_addr: *mut f64, a: __m512d) {
29623+
// see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
2962229624
crate::arch::asm!(
2962329625
vps!("vmovntpd", ",{a}"),
2962429626
p = in(reg) mem_addr,
@@ -29645,6 +29647,7 @@ pub unsafe fn _mm512_stream_pd(mem_addr: *mut f64, a: __m512d) {
2964529647
#[cfg_attr(test, assert_instr(vmovntdq))]
2964629648
#[allow(clippy::cast_ptr_alignment)]
2964729649
pub unsafe fn _mm512_stream_si512(mem_addr: *mut __m512i, a: __m512i) {
29650+
// see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
2964829651
crate::arch::asm!(
2964929652
vps!("vmovntdq", ",{a}"),
2965029653
p = in(reg) mem_addr,
@@ -56328,6 +56331,7 @@ mod tests {
5632856331
let mut mem = Memory { data: [-1.0; 16] };
5632956332

5633056333
_mm512_stream_ps(&mut mem.data[0] as *mut f32, a);
56334+
_mm_sfence();
5633156335
for i in 0..16 {
5633256336
assert_eq!(mem.data[i], get_m512(a, i));
5633356337
}
@@ -56344,6 +56348,7 @@ mod tests {
5634456348
let mut mem = Memory { data: [-1.0; 8] };
5634556349

5634656350
_mm512_stream_pd(&mut mem.data[0] as *mut f64, a);
56351+
_mm_sfence();
5634756352
for i in 0..8 {
5634856353
assert_eq!(mem.data[i], get_m512d(a, i));
5634956354
}
@@ -56360,6 +56365,7 @@ mod tests {
5636056365
let mut mem = Memory { data: [-1; 8] };
5636156366

5636256367
_mm512_stream_si512(mem.data.as_mut_ptr().cast(), a);
56368+
_mm_sfence();
5636356369
for i in 0..8 {
5636456370
assert_eq!(mem.data[i], get_m512i(a, i));
5636556371
}

crates/core_arch/src/x86/sse.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2022,6 +2022,7 @@ unsafe extern "C" {
20222022
#[stable(feature = "simd_x86", since = "1.27.0")]
20232023
#[allow(clippy::cast_ptr_alignment)]
20242024
pub unsafe fn _mm_stream_ps(mem_addr: *mut f32, a: __m128) {
2025+
// see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
20252026
crate::arch::asm!(
20262027
vps!("movntps", ",{a}"),
20272028
p = in(reg) mem_addr,
@@ -3329,6 +3330,7 @@ mod tests {
33293330
let mut mem = Memory { data: [-1.0; 4] };
33303331

33313332
_mm_stream_ps(ptr::addr_of_mut!(mem.data[0]), a);
3333+
_mm_sfence();
33323334
for i in 0..4 {
33333335
assert_eq!(mem.data[i], get_m128(a, i));
33343336
}

crates/core_arch/src/x86/sse2.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1363,6 +1363,7 @@ pub unsafe fn _mm_storel_epi64(mem_addr: *mut __m128i, a: __m128i) {
13631363
#[cfg_attr(test, assert_instr(movntdq))]
13641364
#[stable(feature = "simd_x86", since = "1.27.0")]
13651365
pub unsafe fn _mm_stream_si128(mem_addr: *mut __m128i, a: __m128i) {
1366+
// see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
13661367
crate::arch::asm!(
13671368
vps!("movntdq", ",{a}"),
13681369
p = in(reg) mem_addr,
@@ -1390,6 +1391,7 @@ pub unsafe fn _mm_stream_si128(mem_addr: *mut __m128i, a: __m128i) {
13901391
#[cfg_attr(test, assert_instr(movnti))]
13911392
#[stable(feature = "simd_x86", since = "1.27.0")]
13921393
pub unsafe fn _mm_stream_si32(mem_addr: *mut i32, a: i32) {
1394+
// see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
13931395
crate::arch::asm!(
13941396
vps!("movnti", ",{a:e}"), // `:e` for 32bit value
13951397
p = in(reg) mem_addr,
@@ -2627,6 +2629,7 @@ pub unsafe fn _mm_loadl_pd(a: __m128d, mem_addr: *const f64) -> __m128d {
26272629
#[stable(feature = "simd_x86", since = "1.27.0")]
26282630
#[allow(clippy::cast_ptr_alignment)]
26292631
pub unsafe fn _mm_stream_pd(mem_addr: *mut f64, a: __m128d) {
2632+
// see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
26302633
crate::arch::asm!(
26312634
vps!("movntpd", ",{a}"),
26322635
p = in(reg) mem_addr,
@@ -4070,6 +4073,7 @@ mod tests {
40704073
);
40714074
let mut r = _mm_set1_epi8(0);
40724075
_mm_maskmoveu_si128(a, mask, ptr::addr_of_mut!(r) as *mut i8);
4076+
_mm_sfence();
40734077
let e = _mm_set_epi8(0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
40744078
assert_eq_m128i(r, e);
40754079
}
@@ -4106,6 +4110,7 @@ mod tests {
41064110
let a = _mm_setr_epi32(1, 2, 3, 4);
41074111
let mut r = _mm_undefined_si128();
41084112
_mm_stream_si128(ptr::addr_of_mut!(r), a);
4113+
_mm_sfence();
41094114
assert_eq_m128i(r, a);
41104115
}
41114116

@@ -4117,6 +4122,7 @@ mod tests {
41174122
let a: i32 = 7;
41184123
let mut mem = boxed::Box::<i32>::new(-1);
41194124
_mm_stream_si32(ptr::addr_of_mut!(*mem), a);
4125+
_mm_sfence();
41204126
assert_eq!(a, *mem);
41214127
}
41224128

@@ -4813,6 +4819,7 @@ mod tests {
48134819
let mut mem = Memory { data: [-1.0; 2] };
48144820

48154821
_mm_stream_pd(ptr::addr_of_mut!(mem.data[0]), a);
4822+
_mm_sfence();
48164823
for i in 0..2 {
48174824
assert_eq!(mem.data[i], get_m128d(a, i));
48184825
}

crates/core_arch/src/x86/sse4a.rs

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,6 @@ unsafe extern "C" {
1515
fn insertq(x: i64x2, y: i64x2) -> i64x2;
1616
#[link_name = "llvm.x86.sse4a.insertqi"]
1717
fn insertqi(x: i64x2, y: i64x2, len: u8, idx: u8) -> i64x2;
18-
#[link_name = "llvm.x86.sse4a.movnt.sd"]
19-
fn movntsd(x: *mut f64, y: __m128d);
20-
#[link_name = "llvm.x86.sse4a.movnt.ss"]
21-
fn movntss(x: *mut f32, y: __m128);
2218
}
2319

2420
/// Extracts the bit range specified by `y` from the lower 64 bits of `x`.
@@ -114,7 +110,13 @@ pub fn _mm_inserti_si64<const LEN: i32, const IDX: i32>(x: __m128i, y: __m128i)
114110
#[cfg_attr(test, assert_instr(movntsd))]
115111
#[stable(feature = "simd_x86", since = "1.27.0")]
116112
pub unsafe fn _mm_stream_sd(p: *mut f64, a: __m128d) {
117-
movntsd(p, a);
113+
// see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
114+
crate::arch::asm!(
115+
vps!("movntsd", ",{a}"),
116+
p = in(reg) p,
117+
a = in(xmm_reg) a,
118+
options(nostack, preserves_flags),
119+
);
118120
}
119121

120122
/// Non-temporal store of `a.0` into `p`.
@@ -134,7 +136,13 @@ pub unsafe fn _mm_stream_sd(p: *mut f64, a: __m128d) {
134136
#[cfg_attr(test, assert_instr(movntss))]
135137
#[stable(feature = "simd_x86", since = "1.27.0")]
136138
pub unsafe fn _mm_stream_ss(p: *mut f32, a: __m128) {
137-
movntss(p, a);
139+
// see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
140+
crate::arch::asm!(
141+
vps!("movntss", ",{a}"),
142+
p = in(reg) p,
143+
a = in(xmm_reg) a,
144+
options(nostack, preserves_flags),
145+
);
138146
}
139147

140148
#[cfg(test)]
@@ -209,6 +217,7 @@ mod tests {
209217
let x = _mm_setr_pd(3.0, 4.0);
210218

211219
_mm_stream_sd(d, x);
220+
_mm_sfence();
212221
}
213222
assert_eq!(mem.data[0], 3.0);
214223
assert_eq!(mem.data[1], 2.0);
@@ -234,6 +243,7 @@ mod tests {
234243
let x = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
235244

236245
_mm_stream_ss(d, x);
246+
_mm_sfence();
237247
}
238248
assert_eq!(mem.data[0], 5.0);
239249
assert_eq!(mem.data[1], 2.0);

crates/core_arch/src/x86_64/sse2.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ pub fn _mm_cvttsd_si64x(a: __m128d) -> i64 {
7878
#[cfg_attr(test, assert_instr(movnti))]
7979
#[stable(feature = "simd_x86", since = "1.27.0")]
8080
pub unsafe fn _mm_stream_si64(mem_addr: *mut i64, a: i64) {
81+
// see #1541, we should use inline asm to be sure, because LangRef isn't clear enough
8182
crate::arch::asm!(
8283
vps!("movnti", ",{a}"),
8384
p = in(reg) mem_addr,
@@ -200,6 +201,7 @@ mod tests {
200201
let a: i64 = 7;
201202
let mut mem = boxed::Box::<i64>::new(-1);
202203
_mm_stream_si64(ptr::addr_of_mut!(*mem), a);
204+
_mm_sfence();
203205
assert_eq!(a, *mem);
204206
}
205207

0 commit comments

Comments
 (0)