Apply suggestions from code review

lukas-code · RalfJung · Lukas Markeffsky · commit 3fd95542e273 · 2022-10-20T21:08:21.000+02:00
* Remove `const_align_offset` and just call `align_offset` again
* Remove miri shim for `div_exact`
* Add more comments

Co-authored-by: Ralf Jung <post@ralfj.de>
diff --git a/compiler/rustc_const_eval/src/const_eval/machine.rs b/compiler/rustc_const_eval/src/const_eval/machine.rs
@@ -1,11 +1,12 @@
 use rustc_hir::def::DefKind;
 use rustc_middle::mir;
 use rustc_middle::mir::interpret::PointerArithmetic;
-use rustc_middle::ty::layout::{FnAbiOf, LayoutOf};
+use rustc_middle::ty::layout::FnAbiOf;
 use rustc_middle::ty::{self, Ty, TyCtxt};
 use std::borrow::Borrow;
 use std::collections::hash_map::Entry;
 use std::hash::Hash;
+use std::ops::ControlFlow;
 
 use rustc_data_structures::fx::FxHashMap;
 use std::fmt;
@@ -147,9 +148,10 @@ impl interpret::MayLeak for ! {
 }
 
 impl<'mir, 'tcx: 'mir> CompileTimeEvalContext<'mir, 'tcx> {
-    /// "Intercept" a function call to a panic-related function
-    /// because we have something special to do for it.
-    /// If this returns successfully (`Ok`), the function should just be evaluated normally.
+    /// "Intercept" a function call, because we have something special to do for it.
+    /// All `#[rustc_do_not_const_check]` functions should be hooked here.
+    /// If this returns `Some`, then evaluation should continue with that function.
+    /// Otherwise, the function call has been handled and the function has returned.
     fn hook_special_const_fn(
         &mut self,
         instance: ty::Instance<'tcx>,
@@ -158,7 +160,6 @@ impl<'mir, 'tcx: 'mir> CompileTimeEvalContext<'mir, 'tcx> {
         dest: &PlaceTy<'tcx>,
         ret: Option<mir::BasicBlock>,
     ) -> InterpResult<'tcx, Option<ty::Instance<'tcx>>> {
-        // All `#[rustc_do_not_const_check]` functions should be hooked here.
         let def_id = instance.def_id();
 
         if Some(def_id) == self.tcx.lang_items().panic_display()
@@ -192,34 +193,27 @@ impl<'mir, 'tcx: 'mir> CompileTimeEvalContext<'mir, 'tcx> {
 
             return Ok(Some(new_instance));
         } else if Some(def_id) == self.tcx.lang_items().align_offset_fn() {
-            // For align_offset, we either call const_align_offset or return usize::MAX directly.
-
-            let Some(const_def_id) = self.tcx.lang_items().const_align_offset_fn() else {
-                bug!("`const_align_offset` must be defined to call `align_offset` in const eval")
-            };
-            let const_instance = ty::Instance::resolve(
-                *self.tcx,
-                ty::ParamEnv::reveal_all(),
-                const_def_id,
-                instance.substs,
-            )
-            .unwrap()
-            .unwrap();
-
-            self.align_offset(const_instance, args, dest, ret)?;
-
-            return Ok(None);
+            // For align_offset, we replace the function call if the pointer has no address.
+            match self.align_offset(instance, args, dest, ret)? {
+                ControlFlow::Continue(()) => return Ok(Some(instance)),
+                ControlFlow::Break(()) => return Ok(None),
+            }
         }
         Ok(Some(instance))
     }
 
+    /// `align_offset(ptr, target_align)` needs special handling in const eval, because the pointer
+    /// may not have an address.
+    ///
+    /// If the pointer does have a known address we return `CONTINUE` and the function call should
+    /// proceed as normal. Otherwise we will replace the function call and return `BREAK`.
     fn align_offset(
         &mut self,
-        const_instance: ty::Instance<'tcx>,
+        instance: ty::Instance<'tcx>,
         args: &[OpTy<'tcx>],
         dest: &PlaceTy<'tcx>,
         ret: Option<mir::BasicBlock>,
-    ) -> InterpResult<'tcx> {
+    ) -> InterpResult<'tcx, ControlFlow<()>> {
         assert_eq!(args.len(), 2);
 
         let ptr = self.read_pointer(&args[0])?;
@@ -229,36 +223,40 @@ impl<'mir, 'tcx: 'mir> CompileTimeEvalContext<'mir, 'tcx> {
             throw_ub_format!("`align_offset` called with non-power-of-two align: {}", target_align);
         }
 
-        let addr = match self.ptr_try_get_alloc_id(ptr) {
+        match self.ptr_try_get_alloc_id(ptr) {
             Ok((alloc_id, offset, _extra)) => {
                 let (_size, alloc_align, _kind) = self.get_alloc_info(alloc_id);
 
-                if target_align > alloc_align.bytes() {
+                if target_align <= alloc_align.bytes() {
+                    // Extract the address relative to the allocation base that is definitely
+                    // sufficiently aligned and call `align_offset` again.
+                    let addr = ImmTy::from_uint(offset.bytes(), args[0].layout).into();
+                    let align = ImmTy::from_uint(target_align, args[1].layout).into();
+
+                    let fn_abi = self.fn_abi_of_instance(instance, ty::List::empty())?;
+                    self.eval_fn_call(
+                        FnVal::Instance(instance),
+                        (CallAbi::Rust, fn_abi),
+                        &[addr, align],
+                        false,
+                        dest,
+                        ret,
+                        StackPopUnwind::NotAllowed,
+                    )?;
+                    Ok(ControlFlow::BREAK)
+                } else {
+                    // Not alignable in const, return `usize::MAX`.
                     let usize_max = Scalar::from_machine_usize(self.machine_usize_max(), self);
                     self.write_scalar(usize_max, dest)?;
                     self.return_to_block(ret)?;
-                    return Ok(());
-                } else {
-                    offset.bytes()
+                    Ok(ControlFlow::BREAK)
                 }
             }
-            Err(addr) => addr,
-        };
-
-        let usize_layout = self.layout_of(self.tcx.types.usize)?;
-        let addr = ImmTy::from_uint(addr, usize_layout).into();
-        let align = ImmTy::from_uint(target_align, usize_layout).into();
-
-        let fn_abi = self.fn_abi_of_instance(const_instance, ty::List::empty())?;
-        self.eval_fn_call(
-            FnVal::Instance(const_instance),
-            (CallAbi::Rust, fn_abi),
-            &[addr, align],
-            false,
-            dest,
-            ret,
-            StackPopUnwind::NotAllowed,
-        )
+            Err(_addr) => {
+                // The pointer has an address, continue with function call.
+                Ok(ControlFlow::CONTINUE)
+            }
+        }
     }
 
     /// See documentation on the `ptr_guaranteed_cmp` intrinsic.
diff --git a/compiler/rustc_hir/src/lang_items.rs b/compiler/rustc_hir/src/lang_items.rs
@@ -283,8 +283,7 @@ language_item_table! {
     MaybeUninit,             sym::maybe_uninit,        maybe_uninit,               Target::Union,          GenericRequirement::None;
 
     /// Align offset for stride != 1; must not panic.
-    AlignOffset,             sym::align_offset,        align_offset_fn,            Target::Fn,             GenericRequirement::Exact(1);
-    ConstAlignOffset,        sym::const_align_offset,  const_align_offset_fn,      Target::Fn,             GenericRequirement::Exact(1);
+    AlignOffset,             sym::align_offset,        align_offset_fn,            Target::Fn,             GenericRequirement::None;
 
     Termination,             sym::termination,         termination,                Target::Trait,          GenericRequirement::None;
 
diff --git a/compiler/rustc_span/src/symbol.rs b/compiler/rustc_span/src/symbol.rs
@@ -510,7 +510,6 @@ symbols! {
         concat_macro,
         conservative_impl_trait,
         console,
-        const_align_offset,
         const_allocate,
         const_async_blocks,
         const_compare_raw_pointers,
diff --git a/library/core/src/ptr/mod.rs b/library/core/src/ptr/mod.rs
@@ -1559,41 +1559,20 @@ pub unsafe fn write_volatile<T>(dst: *mut T, src: T) {
 ///
 /// # Safety
 /// `a` must be a power of two.
-#[lang = "align_offset"]
-#[rustc_do_not_const_check]
-#[cfg(not(bootstrap))]
-pub(crate) const unsafe fn align_offset<T: Sized>(p: *const T, a: usize) -> usize {
-    // SAFETY: Caller ensures that `a` is a power of two.
-    unsafe { const_align_offset::<T>(p.addr(), a) }
-}
-
-#[lang = "align_offset"]
-#[cfg(bootstrap)]
-pub(crate) unsafe fn align_offset<T: Sized>(p: *const T, a: usize) -> usize {
-    // SAFETY: Caller ensures that `a` is a power of two.
-    unsafe { const_align_offset::<T>(p.addr(), a) }
-}
-
-/// Align address `addr`.
-///
-/// Calculate offset (in terms of elements of `size_of::<T>()` stride) that has to be applied
-/// to address `addr` so that `addr` would get aligned to `a`.
 ///
-/// Note: This implementation has been carefully tailored to not panic. It is UB for this to panic.
+/// # Notes
+/// This implementation has been carefully tailored to not panic. It is UB for this to panic.
 /// The only real change that can be made here is change of `INV_TABLE_MOD_16` and associated
 /// constants.
 ///
-/// # Safety
-/// `a` must be a power of two.
-///
 /// If we ever decide to make it possible to call the intrinsic with `a` that is not a
 /// power-of-two, it will probably be more prudent to just change to a naive implementation rather
 /// than trying to adapt this to accommodate that change.
 ///
 /// Any questions go to @nagisa.
-#[cfg_attr(not(bootstrap), lang = "const_align_offset")]
-#[rustc_allow_const_fn_unstable(const_exact_div)]
-pub(crate) const unsafe fn const_align_offset<T: Sized>(addr: usize, a: usize) -> usize {
+#[lang = "align_offset"]
+#[cfg(not(bootstrap))]
+pub(crate) const unsafe fn align_offset<T: Sized>(p: *const T, a: usize) -> usize {
     // FIXME(#75598): Direct use of these intrinsics improves codegen significantly at opt-level <=
     // 1, where the method versions of these operations are not inlined.
     use intrinsics::{
@@ -1650,6 +1629,171 @@ pub(crate) const unsafe fn const_align_offset<T: Sized>(addr: usize, a: usize) -
         }
     }
 
+    let stride = mem::size_of::<T>();
+
+    // SAFETY: At runtime transmuting a pointer to `usize` is always safe, because they have the
+    // same layout. During const eval we hook this function to ensure that the pointer always has
+    // an address (only the standard library can do this).
+    let addr = unsafe { mem::transmute(p) };
+
+    // SAFETY: `a` is a power-of-two, therefore non-zero.
+    let a_minus_one = unsafe { unchecked_sub(a, 1) };
+
+    if stride == 0 {
+        // SPECIAL_CASE: handle 0-sized types. No matter how many times we step, the address will
+        // stay the same, so no offset will be able to align the pointer unless it is already
+        // aligned. This branch _will_ be optimized out as `stride` is known at compile-time.
+        let p_mod_a = addr & a_minus_one;
+        return if p_mod_a == 0 { 0 } else { usize::MAX };
+    }
+
+    // SAFETY: `stride == 0` case has been handled by the special case above.
+    let a_mod_stride = unsafe { unchecked_rem(a, stride) };
+    if a_mod_stride == 0 {
+        // SPECIAL_CASE: In cases where the `a` is divisible by `stride`, byte offset to align a
+        // pointer can be computed more simply through `-p (mod a)`. In the off-chance the byte
+        // offset is not a multiple of `stride`, the input pointer was misaligned and no pointer
+        // offset will be able to produce a `p` aligned to the specified `a`.
+        //
+        // The naive `-p (mod a)` equation inhibits LLVM's ability to select instructions
+        // like `lea`. We compute `(round_up_to_next_alignment(p, a) - p)` instead. This
+        // redistributes operations around the load-bearing, but pessimizing `and` instruction
+        // sufficiently for LLVM to be able to utilize the various optimizations it knows about.
+        //
+        // LLVM handles the branch here particularly nicely. If this branch needs to be evaluated
+        // at runtime, it will produce a mask `if addr_mod_stride == 0 { 0 } else { usize::MAX }`
+        // in a branch-free way and then bitwise-OR it with whatever result the `-p mod a`
+        // computation produces.
+
+        // SAFETY: `stride == 0` case has been handled by the special case above.
+        let addr_mod_stride = unsafe { unchecked_rem(addr, stride) };
+
+        return if addr_mod_stride == 0 {
+            let aligned_address = wrapping_add(addr, a_minus_one) & wrapping_sub(0, a);
+            let byte_offset = wrapping_sub(aligned_address, addr);
+            // SAFETY: `stride` is non-zero. This is guaranteed to divide exactly as well, because
+            // addr has been verified to be aligned to the original type’s alignment requirements.
+            unsafe { exact_div(byte_offset, stride) }
+        } else {
+            usize::MAX
+        };
+    }
+
+    // GENERAL_CASE: From here on we’re handling the very general case where `addr` may be
+    // misaligned, there isn’t an obvious relationship between `stride` and `a` that we can take an
+    // advantage of, etc. This case produces machine code that isn’t particularly high quality,
+    // compared to the special cases above. The code produced here is still within the realm of
+    // miracles, given the situations this case has to deal with.
+
+    // SAFETY: a is power-of-two hence non-zero. stride == 0 case is handled above.
+    let gcdpow = unsafe { cttz_nonzero(stride).min(cttz_nonzero(a)) };
+    // SAFETY: gcdpow has an upper-bound that’s at most the number of bits in a usize.
+    let gcd = unsafe { unchecked_shl(1usize, gcdpow) };
+    // SAFETY: gcd is always greater or equal to 1.
+    if addr & unsafe { unchecked_sub(gcd, 1) } == 0 {
+        // This branch solves for the following linear congruence equation:
+        //
+        // ` p + so = 0 mod a `
+        //
+        // `p` here is the pointer value, `s` - stride of `T`, `o` offset in `T`s, and `a` - the
+        // requested alignment.
+        //
+        // With `g = gcd(a, s)`, and the above condition asserting that `p` is also divisible by
+        // `g`, we can denote `a' = a/g`, `s' = s/g`, `p' = p/g`, then this becomes equivalent to:
+        //
+        // ` p' + s'o = 0 mod a' `
+        // ` o = (a' - (p' mod a')) * (s'^-1 mod a') `
+        //
+        // The first term is "the relative alignment of `p` to `a`" (divided by the `g`), the
+        // second term is "how does incrementing `p` by `s` bytes change the relative alignment of
+        // `p`" (again divided by `g`). Division by `g` is necessary to make the inverse well
+        // formed if `a` and `s` are not co-prime.
+        //
+        // Furthermore, the result produced by this solution is not "minimal", so it is necessary
+        // to take the result `o mod lcm(s, a)`. This `lcm(s, a)` is the same as `a'`.
+
+        // SAFETY: `gcdpow` has an upper-bound not greater than the number of trailing 0-bits in
+        // `a`.
+        let a2 = unsafe { unchecked_shr(a, gcdpow) };
+        // SAFETY: `a2` is non-zero. Shifting `a` by `gcdpow` cannot shift out any of the set bits
+        // in `a` (of which it has exactly one).
+        let a2minus1 = unsafe { unchecked_sub(a2, 1) };
+        // SAFETY: `gcdpow` has an upper-bound not greater than the number of trailing 0-bits in
+        // `a`.
+        let s2 = unsafe { unchecked_shr(stride & a_minus_one, gcdpow) };
+        // SAFETY: `gcdpow` has an upper-bound not greater than the number of trailing 0-bits in
+        // `a`. Furthermore, the subtraction cannot overflow, because `a2 = a >> gcdpow` will
+        // always be strictly greater than `(p % a) >> gcdpow`.
+        let minusp2 = unsafe { unchecked_sub(a2, unchecked_shr(addr & a_minus_one, gcdpow)) };
+        // SAFETY: `a2` is a power-of-two, as proven above. `s2` is strictly less than `a2`
+        // because `(s % a) >> gcdpow` is strictly less than `a >> gcdpow`.
+        return wrapping_mul(minusp2, unsafe { mod_inv(s2, a2) }) & a2minus1;
+    }
+
+    // Cannot be aligned at all.
+    usize::MAX
+}
+
+#[lang = "align_offset"]
+#[cfg(bootstrap)]
+pub(crate) unsafe fn align_offset<T: Sized>(p: *const T, a: usize) -> usize {
+    // FIXME(#75598): Direct use of these intrinsics improves codegen significantly at opt-level <=
+    // 1, where the method versions of these operations are not inlined.
+    use intrinsics::{
+        cttz_nonzero, exact_div, unchecked_rem, unchecked_shl, unchecked_shr, unchecked_sub,
+        wrapping_add, wrapping_mul, wrapping_sub,
+    };
+
+    /// Calculate multiplicative modular inverse of `x` modulo `m`.
+    ///
+    /// This implementation is tailored for `align_offset` and has following preconditions:
+    ///
+    /// * `m` is a power-of-two;
+    /// * `x < m`; (if `x ≥ m`, pass in `x % m` instead)
+    ///
+    /// Implementation of this function shall not panic. Ever.
+    #[inline]
+    unsafe fn mod_inv(x: usize, m: usize) -> usize {
+        /// Multiplicative modular inverse table modulo 2⁴ = 16.
+        ///
+        /// Note, that this table does not contain values where inverse does not exist (i.e., for
+        /// `0⁻¹ mod 16`, `2⁻¹ mod 16`, etc.)
+        const INV_TABLE_MOD_16: [u8; 8] = [1, 11, 13, 7, 9, 3, 5, 15];
+        /// Modulo for which the `INV_TABLE_MOD_16` is intended.
+        const INV_TABLE_MOD: usize = 16;
+        /// INV_TABLE_MOD²
+        const INV_TABLE_MOD_SQUARED: usize = INV_TABLE_MOD * INV_TABLE_MOD;
+
+        let table_inverse = INV_TABLE_MOD_16[(x & (INV_TABLE_MOD - 1)) >> 1] as usize;
+        // SAFETY: `m` is required to be a power-of-two, hence non-zero.
+        let m_minus_one = unsafe { unchecked_sub(m, 1) };
+        if m <= INV_TABLE_MOD {
+            table_inverse & m_minus_one
+        } else {
+            // We iterate "up" using the following formula:
+            //
+            // $$ xy ≡ 1 (mod 2ⁿ) → xy (2 - xy) ≡ 1 (mod 2²ⁿ) $$
+            //
+            // until 2²ⁿ ≥ m. Then we can reduce to our desired `m` by taking the result `mod m`.
+            let mut inverse = table_inverse;
+            let mut going_mod = INV_TABLE_MOD_SQUARED;
+            loop {
+                // y = y * (2 - xy) mod n
+                //
+                // Note, that we use wrapping operations here intentionally – the original formula
+                // uses e.g., subtraction `mod n`. It is entirely fine to do them `mod
+                // usize::MAX + 1` instead, because we take the result `mod n` at the end
+                // anyway.
+                inverse = wrapping_mul(inverse, wrapping_sub(2usize, wrapping_mul(x, inverse)));
+                if going_mod >= m {
+                    return inverse & m_minus_one;
+                }
+                going_mod = wrapping_mul(going_mod, going_mod);
+            }
+        }
+    }
+
+    let addr = p.addr();
     let stride = mem::size_of::<T>();
     // SAFETY: `a` is a power-of-two, therefore non-zero.
     let a_minus_one = unsafe { unchecked_sub(a, 1) };
diff --git a/src/tools/miri/src/shims/intrinsics/mod.rs b/src/tools/miri/src/shims/intrinsics/mod.rs
@@ -357,11 +357,6 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> {
             }
 
             // Other
-            "exact_div" => {
-                let [num, denom] = check_arg_count(args)?;
-                this.exact_div(&this.read_immediate(num)?, &this.read_immediate(denom)?, dest)?;
-            }
-
             "breakpoint" => {
                 let [] = check_arg_count(args)?;
                 // normally this would raise a SIGTRAP, which aborts if no debugger is connected