@@ -2,7 +2,7 @@ use std::collections::hash_map::Entry;
22use std:: marker:: PhantomData ;
33use std:: ops:: Range ;
44
5- use rustc_abi:: { BackendRepr , FieldIdx , FieldsShape , Size , VariantIdx } ;
5+ use rustc_abi:: { BackendRepr , FieldIdx , FieldsShape , ScalableElt , Size , VariantIdx } ;
66use rustc_data_structures:: fx:: FxHashMap ;
77use rustc_index:: IndexVec ;
88use rustc_middle:: middle:: codegen_fn_attrs:: CodegenFnAttrFlags ;
@@ -361,6 +361,49 @@ impl<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>> FunctionCx<'a, 'tcx, Bx> {
361361 return ;
362362 }
363363
364+ // Don't spill `<vscale x N x i1>` for `N != 16`:
365+ //
366+ // SVE predicates are only one bit for each byte in an SVE vector (which makes
367+ // sense, the predicate only needs to keep track of whether a lane is
368+ // enabled/disabled). i.e. a `<vscale x 16 x i8>` vector has a `<vscale x 16 x i1>`
369+ // predicate type. `<vscale x 16 x i1>` corresponds to two bytes of storage,
370+ // multiplied by the `vscale`, with one bit for each of the sixteen lanes.
371+ //
372+ // For a vector with fewer elements, such as `svint32_t`/`<vscale x 4 x i32>`,
373+ // while only a `<vscale x 4 x i1>` predicate type would be strictly necessary,
374+ // relevant intrinsics still take a `svbool_t`/`<vscale x 16 x i1>` - this is
375+ // because a `<vscale x 4 x i1>` is only half of a byte (for `vscale=1`), and with
376+ // memory being byte-addressable, it's unclear how to store that.
377+ //
378+ // Due to this, LLVM ultimately decided not to support stores of `<vscale x N x i1>`
379+ // for `N != 16`, since for `vscale=1` and `N` fewer than sixteen, partial bytes would
380+ // need to be stored (except for `N=8`, but that also isn't supported). `N` can
381+ // never be greater than sixteen as that ends up larger than the 128-bit increment
382+ // size.
383+ //
384+ // Internally, with an intrinsic operating on a `svint32_t`/`<vscale x 4 x i32>`
385+ // (for example), the intrinsic takes the `svbool_t`/`<vscale x 16 x i1>` predicate
386+ // and casts it to a `svbool4_t`/`<vscale x 4 x i1>`. Therefore, it's important that
387+ // the `<vscale x 4 x i1>` never spills because that'll cause errors during
388+ // instruction selection. Spilling to the stack to create debuginfo for these
389+ // intermediate values must be avoided, and skipping the spill won't degrade the
390+ // debugging experience anyway.
391+ if operand. layout . ty . is_scalable_vector ( )
392+ && bx. sess ( ) . target . arch == "aarch64"
393+ && let ty:: Adt ( adt, args) = & operand. layout . ty . kind ( )
394+ && let Some ( marker_type_field) =
395+ adt. non_enum_variant ( ) . fields . get ( FieldIdx :: from_u32 ( 0 ) )
396+ {
397+ let marker_type = marker_type_field. ty ( bx. tcx ( ) , args) ;
398+ // i.e. `<vscale x N x i1>` when `N != 16`
399+ if let ty:: Slice ( element_ty) = marker_type. kind ( )
400+ && element_ty. is_bool ( )
401+ && adt. repr ( ) . scalable != Some ( ScalableElt :: ElementCount ( 16 ) )
402+ {
403+ return ;
404+ }
405+ }
406+
364407 Self :: spill_operand_to_stack ( * operand, name, bx)
365408 }
366409
0 commit comments