@@ -1570,8 +1570,9 @@ pub unsafe fn write_volatile<T>(dst: *mut T, src: T) {
15701570/// than trying to adapt this to accommodate that change.
15711571///
15721572/// Any questions go to @nagisa.
1573+ // #[cfg(not(bootstrap))] -- Calling this function in a const context from the bootstrap
1574+ // compiler will always cause an error.
15731575#[ lang = "align_offset" ]
1574- #[ cfg( not( bootstrap) ) ]
15751576pub ( crate ) const unsafe fn align_offset < T : Sized > ( p : * const T , a : usize ) -> usize {
15761577 // FIXME(#75598): Direct use of these intrinsics improves codegen significantly at opt-level <=
15771578 // 1, where the method versions of these operations are not inlined.
@@ -1734,165 +1735,6 @@ pub(crate) const unsafe fn align_offset<T: Sized>(p: *const T, a: usize) -> usiz
17341735 usize:: MAX
17351736}
17361737
1737- #[ lang = "align_offset" ]
1738- #[ cfg( bootstrap) ]
1739- pub ( crate ) unsafe fn align_offset < T : Sized > ( p : * const T , a : usize ) -> usize {
1740- // FIXME(#75598): Direct use of these intrinsics improves codegen significantly at opt-level <=
1741- // 1, where the method versions of these operations are not inlined.
1742- use intrinsics:: {
1743- cttz_nonzero, exact_div, unchecked_rem, unchecked_shl, unchecked_shr, unchecked_sub,
1744- wrapping_add, wrapping_mul, wrapping_sub,
1745- } ;
1746-
1747- /// Calculate multiplicative modular inverse of `x` modulo `m`.
1748- ///
1749- /// This implementation is tailored for `align_offset` and has the following preconditions:
1750- ///
1751- /// * `m` is a power-of-two;
1752- /// * `x < m`; (if `x ≥ m`, pass in `x % m` instead)
1753- ///
1754- /// Implementation of this function shall not panic. Ever.
1755- #[ inline]
1756- unsafe fn mod_inv ( x : usize , m : usize ) -> usize {
1757- /// Multiplicative modular inverse table modulo 2⁴ = 16.
1758- ///
1759- /// Note, that this table does not contain values where inverse does not exist (i.e., for
1760- /// `0⁻¹ mod 16`, `2⁻¹ mod 16`, etc.)
1761- const INV_TABLE_MOD_16 : [ u8 ; 8 ] = [ 1 , 11 , 13 , 7 , 9 , 3 , 5 , 15 ] ;
1762- /// Modulo for which the `INV_TABLE_MOD_16` is intended.
1763- const INV_TABLE_MOD : usize = 16 ;
1764- /// INV_TABLE_MOD²
1765- const INV_TABLE_MOD_SQUARED : usize = INV_TABLE_MOD * INV_TABLE_MOD ;
1766-
1767- let table_inverse = INV_TABLE_MOD_16 [ ( x & ( INV_TABLE_MOD - 1 ) ) >> 1 ] as usize ;
1768- // SAFETY: `m` is required to be a power-of-two, hence non-zero.
1769- let m_minus_one = unsafe { unchecked_sub ( m, 1 ) } ;
1770- if m <= INV_TABLE_MOD {
1771- table_inverse & m_minus_one
1772- } else {
1773- // We iterate "up" using the following formula:
1774- //
1775- // $$ xy ≡ 1 (mod 2ⁿ) → xy (2 - xy) ≡ 1 (mod 2²ⁿ) $$
1776- //
1777- // until 2²ⁿ ≥ m. Then we can reduce to our desired `m` by taking the result `mod m`.
1778- let mut inverse = table_inverse;
1779- let mut going_mod = INV_TABLE_MOD_SQUARED ;
1780- loop {
1781- // y = y * (2 - xy) mod n
1782- //
1783- // Note that we use wrapping operations here intentionally – the original formula
1784- // uses e.g., subtraction `mod n`. It is entirely fine to do them `mod
1785- // (usize::MAX + 1)` (i.e. modulo 2 to the power of `usize::BITS`) instead, because `n` is a
1786- // power of two that divides it and we take the result `mod n` at the end anyway.
1787- inverse = wrapping_mul ( inverse, wrapping_sub ( 2usize , wrapping_mul ( x, inverse) ) ) ;
1788- if going_mod >= m {
1789- return inverse & m_minus_one;
1790- }
1791- going_mod = wrapping_mul ( going_mod, going_mod) ;
1792- }
1793- }
1794- }
1795-
1796- let addr = p. addr ( ) ;
1797- let stride = mem:: size_of :: < T > ( ) ;
1798- // SAFETY: `a` is a power-of-two, therefore non-zero.
1799- let a_minus_one = unsafe { unchecked_sub ( a, 1 ) } ;
1800-
1801- if stride == 0 {
1802- // SPECIAL_CASE: handle 0-sized types. No matter how many times we step, the address will
1803- // stay the same, so no offset will be able to align the pointer unless it is already
1804- // aligned. This branch _will_ be optimized out as `stride` is known at compile-time.
1805- let p_mod_a = addr & a_minus_one;
1806- return if p_mod_a == 0 { 0 } else { usize:: MAX } ;
1807- }
1808-
1809- // SAFETY: `stride == 0` case has been handled by the special case above.
1810- let a_mod_stride = unsafe { unchecked_rem ( a, stride) } ;
1811- if a_mod_stride == 0 {
1812- // SPECIAL_CASE: In cases where the `a` is divisible by `stride`, byte offset to align a
1813- // pointer can be computed more simply through `-p (mod a)`. In the off-chance the byte
1814- // offset is not a multiple of `stride`, the input pointer was misaligned and no pointer
1815- // offset will be able to produce a `p` aligned to the specified `a`.
1816- //
1817- // The naive `-p (mod a)` equation inhibits LLVM's ability to select instructions
1818- // like `lea`. We compute `(round_up_to_next_alignment(p, a) - p)` instead. This
1819- // redistributes operations around the load-bearing, but pessimizing `and` instruction
1820- // sufficiently for LLVM to be able to utilize the various optimizations it knows about.
1821- //
1822- // LLVM handles the branch here particularly nicely. If this branch needs to be evaluated
1823- // at runtime, it will produce a mask `if addr_mod_stride == 0 { 0 } else { usize::MAX }`
1824- // in a branch-free way and then bitwise-OR it with whatever result the `-p mod a`
1825- // computation produces.
1826-
1827- // SAFETY: `stride == 0` case has been handled by the special case above.
1828- let addr_mod_stride = unsafe { unchecked_rem ( addr, stride) } ;
1829-
1830- return if addr_mod_stride == 0 {
1831- let aligned_address = wrapping_add ( addr, a_minus_one) & wrapping_sub ( 0 , a) ;
1832- let byte_offset = wrapping_sub ( aligned_address, addr) ;
1833- // SAFETY: `stride` is non-zero. This is guaranteed to divide exactly as well, because
1834- // addr has been verified to be aligned to the original type’s alignment requirements.
1835- unsafe { exact_div ( byte_offset, stride) }
1836- } else {
1837- usize:: MAX
1838- } ;
1839- }
1840-
1841- // GENERAL_CASE: From here on we’re handling the very general case where `addr` may be
1842- // misaligned, there isn’t an obvious relationship between `stride` and `a` that we can take
1843- // advantage of, etc. This case produces machine code that isn’t particularly high quality,
1844- // compared to the special cases above. The code produced here is still within the realm of
1845- // miracles, given the situations this case has to deal with.
1846-
1847- // SAFETY: a is power-of-two hence non-zero. stride == 0 case is handled above.
1848- let gcdpow = unsafe { cttz_nonzero ( stride) . min ( cttz_nonzero ( a) ) } ;
1849- // SAFETY: gcdpow has an upper-bound that’s at most the number of bits in a usize.
1850- let gcd = unsafe { unchecked_shl ( 1usize , gcdpow) } ;
1851- // SAFETY: gcd is always greater or equal to 1.
1852- if addr & unsafe { unchecked_sub ( gcd, 1 ) } == 0 {
1853- // This branch solves for the following linear congruence equation:
1854- //
1855- // ` p + so = 0 mod a `
1856- //
1857- // `p` here is the pointer value, `s` - stride of `T`, `o` offset in `T`s, and `a` - the
1858- // requested alignment.
1859- //
1860- // With `g = gcd(a, s)`, and the above condition asserting that `p` is also divisible by
1861- // `g`, we can denote `a' = a/g`, `s' = s/g`, `p' = p/g`, then this becomes equivalent to:
1862- //
1863- // ` p' + s'o = 0 mod a' `
1864- // ` o = (a' - (p' mod a')) * (s'^-1 mod a') `
1865- //
1866- // The first term is "the relative alignment of `p` to `a`" (divided by the `g`), the
1867- // second term is "how does incrementing `p` by `s` bytes change the relative alignment of
1868- // `p`" (again divided by `g`). Division by `g` is necessary to make the inverse well
1869- // formed if `a` and `s` are not co-prime.
1870- //
1871- // Furthermore, the result produced by this solution is not "minimal", so it is necessary
1872- // to take the result `o mod lcm(s, a)`. This `lcm(s, a)` is the same as `a'`.
1873-
1874- // SAFETY: `gcdpow` has an upper-bound not greater than the number of trailing 0-bits in
1875- // `a`.
1876- let a2 = unsafe { unchecked_shr ( a, gcdpow) } ;
1877- // SAFETY: `a2` is non-zero. Shifting `a` by `gcdpow` cannot shift out any of the set bits
1878- // in `a` (of which it has exactly one).
1879- let a2minus1 = unsafe { unchecked_sub ( a2, 1 ) } ;
1880- // SAFETY: `gcdpow` has an upper-bound not greater than the number of trailing 0-bits in
1881- // `a`.
1882- let s2 = unsafe { unchecked_shr ( stride & a_minus_one, gcdpow) } ;
1883- // SAFETY: `gcdpow` has an upper-bound not greater than the number of trailing 0-bits in
1884- // `a`. Furthermore, the subtraction cannot overflow, because `a2 = a >> gcdpow` will
1885- // always be strictly greater than `(p % a) >> gcdpow`.
1886- let minusp2 = unsafe { unchecked_sub ( a2, unchecked_shr ( addr & a_minus_one, gcdpow) ) } ;
1887- // SAFETY: `a2` is a power-of-two, as proven above. `s2` is strictly less than `a2`
1888- // because `(s % a) >> gcdpow` is strictly less than `a >> gcdpow`.
1889- return wrapping_mul ( minusp2, unsafe { mod_inv ( s2, a2) } ) & a2minus1;
1890- }
1891-
1892- // Cannot be aligned at all.
1893- usize:: MAX
1894- }
1895-
18961738/// Compares raw pointers for equality.
18971739///
18981740/// This is the same as using the `==` operator, but less generic:
0 commit comments