@@ -1594,11 +1594,10 @@ pub(crate) unsafe fn align_offset<T: Sized>(p: *const T, a: usize) -> usize {
     // FIXME(#75598): Direct use of these intrinsics improves codegen significantly at opt-level <=
     // 1, where the method versions of these operations are not inlined.
     use intrinsics::{
-        unchecked_shl, unchecked_shr, unchecked_sub, wrapping_add, wrapping_mul, wrapping_sub,
+        cttz_nonzero, exact_div, unchecked_rem, unchecked_shl, unchecked_shr, unchecked_sub,
+        wrapping_add, wrapping_mul, wrapping_sub,
     };
 
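As context for readers unfamiliar with these intrinsics, a rough sketch of their stable counterparts (illustrative values only; the intrinsics skip the checks the stable methods perform, which is exactly the codegen benefit referenced above):

```rust
let (a, b) = (24_usize, 8_usize);
assert_eq!(a.wrapping_sub(b), 16);  // wrapping_sub: wraps on overflow rather than panicking
assert_eq!(a.trailing_zeros(), 3);  // cttz_nonzero: trailing zero bits, input must be non-zero
assert_eq!(a % b, 0);               // unchecked_rem: `%` without the division-by-zero check
assert_eq!(a / b, 3);               // exact_div: `/` that assumes the remainder is zero
```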
-    let addr = p.addr();
-
     /// Calculate the multiplicative modular inverse of `x` modulo `m`.
     ///
     /// This implementation is tailored for `align_offset` and has the following preconditions:
@@ -1648,36 +1647,61 @@ pub(crate) unsafe fn align_offset<T: Sized>(p: *const T, a: usize) -> usize {
         }
     }
 
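The body of `mod_inv` is elided from this hunk. As a hedged aside, a minimal standalone sketch of the technique it relies on: an odd `x` is inverted modulo a power of two by Newton-Hensel lifting, which doubles the number of correct low bits per step (`mod_inv_sketch` is a hypothetical name; the real helper seeds the iteration from a small lookup table rather than from a single bit):

```rust
fn mod_inv_sketch(x: usize, m: usize) -> usize {
    debug_assert!(x % 2 == 1 && m.is_power_of_two());
    // `1` inverts any odd `x` modulo 2; each `inv <- inv * (2 - x * inv)`
    // step doubles the number of correct low bits of the inverse.
    let mut inv: usize = 1;
    let mut valid_bits: u32 = 1;
    while valid_bits < m.trailing_zeros() {
        inv = inv.wrapping_mul(2_usize.wrapping_sub(x.wrapping_mul(inv)));
        valid_bits *= 2;
    }
    inv & (m - 1) // reduce the result modulo `m`
}

assert_eq!(mod_inv_sketch(3, 16), 11); // 3 * 11 = 33 ≡ 1 (mod 16)
```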
+    let addr = p.addr();
     let stride = mem::size_of::<T>();
     // SAFETY: `a` is a power-of-two, therefore non-zero.
     let a_minus_one = unsafe { unchecked_sub(a, 1) };
-    if stride == 1 {
-        // `stride == 1` case can be computed more simply through `-p (mod a)`, but doing so
-        // inhibits LLVM's ability to select instructions like `lea`. Instead we compute
+
+    if stride == 0 {
+        // SPECIAL_CASE: handle 0-sized types. No matter how many times we step, the address will
+        // stay the same, so no offset will be able to align the pointer unless it is already
+        // aligned. This branch _will_ be optimized out as `stride` is known at compile-time.
+        let p_mod_a = addr & a_minus_one;
+        return if p_mod_a == 0 { 0 } else { usize::MAX };
+    }
+
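Restating this zero-size branch as a hypothetical free function with made-up addresses makes the all-or-nothing behaviour concrete:

```rust
fn zst_align_offset(addr: usize, a: usize) -> usize {
    // Mirrors the branch above: a pointer to a ZST is either aligned already
    // or no number of (zero-byte) steps will ever make it aligned.
    if addr & (a - 1) == 0 { 0 } else { usize::MAX }
}

assert_eq!(zst_align_offset(0x1000, 8), 0);          // already 8-byte aligned
assert_eq!(zst_align_offset(0x1003, 8), usize::MAX); // can never become aligned
```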
+    // SAFETY: `stride == 0` case has been handled by the special case above.
+    let a_mod_stride = unsafe { unchecked_rem(a, stride) };
+    if a_mod_stride == 0 {
+        // SPECIAL_CASE: In cases where `a` is divisible by `stride`, the byte offset to align a
+        // pointer can be computed more simply through `-p (mod a)`. In the off-chance the byte
+        // offset is not a multiple of `stride`, the input pointer was misaligned and no pointer
+        // offset will be able to produce a `p` aligned to the specified `a`.
         //
-        //    round_up_to_next_alignment(p, a) - p
+        // The naive `-p (mod a)` equation inhibits LLVM's ability to select instructions
+        // like `lea`. We compute `(round_up_to_next_alignment(p, a) - p)` instead. This
+        // redistributes operations around the load-bearing, but pessimizing `and` instruction
+        // sufficiently for LLVM to be able to utilize the various optimizations it knows about.
         //
-        // which distributes operations around the load-bearing, but pessimizing `and` sufficiently
-        // for LLVM to be able to utilize the various optimizations it knows about.
-        return wrapping_sub(wrapping_add(addr, a_minus_one) & wrapping_sub(0, a), addr);
-    }
+        // LLVM handles the branch here particularly nicely. If this branch needs to be evaluated
+        // at runtime, it will produce a mask `if addr_mod_stride == 0 { 0 } else { usize::MAX }`
+        // in a branch-free way and then bitwise-OR it with whatever result the `-p mod a`
+        // computation produces.
+
+        // SAFETY: `stride == 0` case has been handled by the special case above.
+        let addr_mod_stride = unsafe { unchecked_rem(addr, stride) };
 
-    let pmoda = addr & a_minus_one;
-    if pmoda == 0 {
-        // Already aligned. Yay!
-        return 0;
-    } else if stride == 0 {
-        // If the pointer is not aligned, and the element is zero-sized, then no amount of
-        // elements will ever align the pointer.
-        return usize::MAX;
+        return if addr_mod_stride == 0 {
+            let aligned_address = wrapping_add(addr, a_minus_one) & wrapping_sub(0, a);
+            let byte_offset = wrapping_sub(aligned_address, addr);
+            // SAFETY: `stride` is non-zero. This is guaranteed to divide exactly as well, because
+            // `addr` has been verified to be aligned to the original type's alignment requirements.
+            unsafe { exact_div(byte_offset, stride) }
+        } else {
+            usize::MAX
+        };
     }
 
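A worked instance of this special case, with made-up values (`stride = 4`, `a = 16`, `addr = 0x1004`) and the stable wrapping methods standing in for the intrinsics:

```rust
let (addr, stride, a) = (0x1004_usize, 4_usize, 16_usize);
let aligned_address = addr.wrapping_add(a - 1) & 0_usize.wrapping_sub(a); // 0x1010
let byte_offset = aligned_address.wrapping_sub(addr); // 0xC
assert_eq!(byte_offset % stride, 0); // `addr` was element-aligned, so this divides exactly
assert_eq!(byte_offset / stride, 3); // advance three 4-byte elements to reach 0x1010
```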
-    let smoda = stride & a_minus_one;
+    // GENERAL_CASE: From here on we're handling the very general case where `addr` may be
+    // misaligned, there isn't an obvious relationship between `stride` and `a` that we can take
+    // advantage of, etc. This case produces machine code that isn't particularly high quality,
+    // compared to the special cases above. The code produced here is still within the realm of
+    // miracles, given the situations this case has to deal with.
+
     // SAFETY: a is power-of-two hence non-zero. stride == 0 case is handled above.
-    let gcdpow = unsafe { intrinsics::cttz_nonzero(stride).min(intrinsics::cttz_nonzero(a)) };
+    let gcdpow = unsafe { cttz_nonzero(stride).min(cttz_nonzero(a)) };
     // SAFETY: gcdpow has an upper-bound that's at most the number of bits in a usize.
     let gcd = unsafe { unchecked_shl(1usize, gcdpow) };
-
     // SAFETY: gcd is always greater than or equal to 1.
     if addr & unsafe { unchecked_sub(gcd, 1) } == 0 {
         // This branch solves for the following linear congruence equation:
@@ -1693,14 +1717,13 @@ pub(crate) unsafe fn align_offset<T: Sized>(p: *const T, a: usize) -> usize {
         // ` p' + s'o = 0 mod a' `
         // ` o = (a' - (p' mod a')) * (s'^-1 mod a') `
         //
-        // The first term is "the relative alignment of `p` to `a`" (divided by the `g`), the second
-        // term is "how does incrementing `p` by `s` bytes change the relative alignment of `p`" (again
-        // divided by `g`).
-        // Division by `g` is necessary to make the inverse well formed if `a` and `s` are not
-        // co-prime.
+        // The first term is "the relative alignment of `p` to `a`" (divided by `g`), the
+        // second term is "how does incrementing `p` by `s` bytes change the relative alignment
+        // of `p`" (again divided by `g`). Division by `g` is necessary to make the inverse
+        // well formed if `a` and `s` are not co-prime.
         //
         // Furthermore, the result produced by this solution is not "minimal", so it is necessary
-        // to take the result `o mod lcm(s, a)`. We can replace `lcm(s, a)` with just a `a'`.
+        // to take the result `o mod lcm(s, a)`. This `lcm(s, a)` is the same as `a'`.
 
         // SAFETY: `gcdpow` has an upper-bound not greater than the number of trailing 0-bits in
         // `a`.
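To make the congruence concrete, a small numeric instance with hypothetical values `s = 6`, `a = 8`, and `p ≡ 2 (mod 8)`, giving `g = 2`, `s' = 3`, `a' = 4`, `p' = 1`:

```rust
let (s, a, p_mod_a) = (6_usize, 8_usize, 2_usize);
let g = 1_usize << s.trailing_zeros().min(a.trailing_zeros()); // gcd(6, 8) = 2
let (s2, a2, p2) = (s / g, a / g, p_mod_a / g); // s' = 3, a' = 4, p' = 1
let s2_inv = 3_usize; // 3 * 3 = 9 ≡ 1 (mod 4), found by inspection for this sketch
let o = ((a2 - p2 % a2) * s2_inv) % a2; // (4 - 1) * 3 mod 4 = 1
assert_eq!((p_mod_a + s * o) % a, 0); // stepping one element aligns the address
```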
@@ -1710,11 +1733,11 @@ pub(crate) unsafe fn align_offset<T: Sized>(p: *const T, a: usize) -> usize {
         let a2minus1 = unsafe { unchecked_sub(a2, 1) };
         // SAFETY: `gcdpow` has an upper-bound not greater than the number of trailing 0-bits in
         // `a`.
-        let s2 = unsafe { unchecked_shr(smoda, gcdpow) };
+        let s2 = unsafe { unchecked_shr(stride & a_minus_one, gcdpow) };
         // SAFETY: `gcdpow` has an upper-bound not greater than the number of trailing 0-bits in
         // `a`. Furthermore, the subtraction cannot overflow, because `a2 = a >> gcdpow` will
         // always be strictly greater than `(p % a) >> gcdpow`.
-        let minusp2 = unsafe { unchecked_sub(a2, unchecked_shr(pmoda, gcdpow)) };
+        let minusp2 = unsafe { unchecked_sub(a2, unchecked_shr(addr & a_minus_one, gcdpow)) };
         // SAFETY: `a2` is a power-of-two, as proven above. `s2` is strictly less than `a2`
         // because `(s % a) >> gcdpow` is strictly less than `a >> gcdpow`.
         return wrapping_mul(minusp2, unsafe { mod_inv(s2, a2) }) & a2minus1;
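Finally, a hedged sketch of how this helper surfaces through the public `align_offset` API (the exact offset depends on the runtime address, hence the `usize::MAX` guard):

```rust
let buf = [0_u16; 8];
let p: *const u16 = buf.as_ptr();
let off = p.align_offset(4); // element steps needed for 4-byte alignment
if off != usize::MAX {
    // `wrapping_add` advances by `off` elements, i.e. `off * size_of::<u16>()` bytes.
    assert_eq!(p.wrapping_add(off).addr() % 4, 0);
}
```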