From 88687a91e459c548ccd54d71c20ef8db9451db7e Mon Sep 17 00:00:00 2001 From: Coreforge Date: Tue, 7 Nov 2023 21:30:44 +0100 Subject: [PATCH 1/8] memory access fixes/workarounds for the pi5 --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_sa.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c | 6 +++--- drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c | 4 ++-- drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 20 +++++++++++--------- 7 files changed, 20 insertions(+), 18 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 9c99d69b4b083e..43b5da589e437c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -1048,7 +1048,7 @@ static int amdgpu_device_wb_init(struct amdgpu_device *adev) memset(&adev->wb.used, 0, sizeof(adev->wb.used)); /* clear wb memory */ - memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8); + memset_io((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8); } return 0; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index 0ca51df46cc0d3..bca52f208a29a7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -364,7 +364,7 @@ int amdgpu_gfx_kiq_init(struct amdgpu_device *adev, return r; } - memset(hpd, 0, hpd_size); + memset_io(hpd, 0, hpd_size); r = amdgpu_bo_reserve(kiq->eop_obj, true); if (unlikely(r != 0)) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sa.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sa.c index 10df731998b22f..2627963b2c0d4a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sa.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sa.c @@ -58,7 +58,7 @@ int amdgpu_sa_bo_manager_init(struct amdgpu_device *adev, return r; } - memset(sa_manager->cpu_ptr, 0, size); + memset_io(sa_manager->cpu_ptr, 0, size); drm_suballoc_manager_init(&sa_manager->base, size, suballoc_align); return r; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index 8c3fb1562ffef9..0fca9771e25a42 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -1101,7 +1101,7 @@ static struct ttm_tt *amdgpu_ttm_tt_create(struct ttm_buffer_object *bo, if (abo->flags & AMDGPU_GEM_CREATE_CPU_GTT_USWC) caching = ttm_write_combined; else - caching = ttm_cached; + caching = ttm_uncached; /* allocate space for the uninitialized page entries */ if (ttm_sg_tt_init(>t->ttm, bo, page_flags, caching)) { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c index bef7541770641c..21e4dc87588eee 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c @@ -962,7 +962,7 @@ static int amdgpu_ucode_init_single_fw(struct amdgpu_device *adev, le32_to_cpu(header->ucode_array_offset_bytes); } - memcpy(ucode->kaddr, ucode_addr, ucode->ucode_size); + memcpy_toio(ucode->kaddr, ucode_addr, ucode->ucode_size); return 0; } @@ -986,7 +986,7 @@ static int amdgpu_ucode_patch_jt(struct amdgpu_firmware_info *ucode, src_addr = (uint8_t *)ucode->fw->data + le32_to_cpu(comm_hdr->ucode_array_offset_bytes) + (le32_to_cpu(header->jt_offset) * 4); - memcpy(dst_addr, src_addr, le32_to_cpu(header->jt_size) * 4); + memcpy_toio(dst_addr, src_addr, le32_to_cpu(header->jt_size) * 4); return 0; } @@ -1003,7 +1003,7 @@ int 
amdgpu_ucode_create_bo(struct amdgpu_device *adev) dev_err(adev->dev, "failed to create kernel buffer for firmware.fw_buf\n"); return -ENOMEM; } else if (amdgpu_sriov_vf(adev)) { - memset(adev->firmware.fw_buf_ptr, 0, adev->firmware.fw_size); + memset_io(adev->firmware.fw_buf_ptr, 0, adev->firmware.fw_size); } } return 0; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c index b7441654e6fa73..79c5d0b00cf451 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c @@ -1196,7 +1196,7 @@ int amdgpu_uvd_get_create_msg(struct amdgpu_ring *ring, uint32_t handle, { struct amdgpu_device *adev = ring->adev; struct amdgpu_bo *bo = adev->uvd.ib_bo; - uint32_t *msg; + volatile uint32_t *msg; int i; msg = amdgpu_bo_kptr(bo); @@ -1224,7 +1224,7 @@ int amdgpu_uvd_get_destroy_msg(struct amdgpu_ring *ring, uint32_t handle, { struct amdgpu_device *adev = ring->adev; struct amdgpu_bo *bo = NULL; - uint32_t *msg; + volatile uint32_t *msg; int r, i; if (direct) { diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c index 1943beb135c4c2..0265e975a58502 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c @@ -1322,7 +1322,7 @@ static int gfx_v8_0_mec_init(struct amdgpu_device *adev) return r; } - memset(hpd, 0, mec_hpd_size); + memset_io(hpd, 0, mec_hpd_size); amdgpu_bo_kunmap(adev->gfx.mec.hpd_eop_obj); amdgpu_bo_unreserve(adev->gfx.mec.hpd_eop_obj); @@ -4395,7 +4395,7 @@ static int gfx_v8_0_deactivate_hqd(struct amdgpu_device *adev, u32 req) return r; } -static void gfx_v8_0_mqd_set_priority(struct amdgpu_ring *ring, struct vi_mqd *mqd) +static void gfx_v8_0_mqd_set_priority(struct amdgpu_ring *ring, volatile struct vi_mqd *mqd) { struct amdgpu_device *adev = ring->adev; @@ -4411,7 +4411,7 @@ static void gfx_v8_0_mqd_set_priority(struct amdgpu_ring *ring, struct vi_mqd *m static int gfx_v8_0_mqd_init(struct amdgpu_ring *ring) { struct amdgpu_device *adev = ring->adev; - struct vi_mqd *mqd = ring->mqd_ptr; + volatile struct vi_mqd *mqd = ring->mqd_ptr; uint64_t hqd_gpu_addr, wb_gpu_addr, eop_base_addr; uint32_t tmp; @@ -4422,11 +4422,13 @@ static int gfx_v8_0_mqd_init(struct amdgpu_ring *ring) mqd->compute_static_thread_mgmt_se2 = 0xffffffff; mqd->compute_static_thread_mgmt_se3 = 0xffffffff; mqd->compute_misc_reserved = 0x00000003; + mqd->dynamic_cu_mask_addr_lo = lower_32_bits(ring->mqd_gpu_addr + offsetof(struct vi_mqd_allocation, dynamic_cu_mask)); mqd->dynamic_cu_mask_addr_hi = upper_32_bits(ring->mqd_gpu_addr + offsetof(struct vi_mqd_allocation, dynamic_cu_mask)); eop_base_addr = ring->eop_gpu_addr >> 8; + mqd->cp_hqd_eop_base_addr_lo = eop_base_addr; mqd->cp_hqd_eop_base_addr_hi = upper_32_bits(eop_base_addr); @@ -4602,7 +4604,7 @@ static int gfx_v8_0_kiq_init_queue(struct amdgpu_ring *ring) if (amdgpu_in_reset(adev)) { /* for GPU_RESET case */ /* reset MQD to a clean status */ if (adev->gfx.kiq[0].mqd_backup) - memcpy(mqd, adev->gfx.kiq[0].mqd_backup, sizeof(struct vi_mqd_allocation)); + memcpy_toio(mqd, adev->gfx.kiq[0].mqd_backup, sizeof(struct vi_mqd_allocation)); /* reset ring buffer */ ring->wptr = 0; @@ -4613,7 +4615,7 @@ static int gfx_v8_0_kiq_init_queue(struct amdgpu_ring *ring) vi_srbm_select(adev, 0, 0, 0, 0); mutex_unlock(&adev->srbm_mutex); } else { - memset((void *)mqd, 0, sizeof(struct vi_mqd_allocation)); + memset_io((void *)mqd, 0, sizeof(struct vi_mqd_allocation)); ((struct vi_mqd_allocation *)mqd)->dynamic_cu_mask = 
0xFFFFFFFF; ((struct vi_mqd_allocation *)mqd)->dynamic_rb_mask = 0xFFFFFFFF; if (amdgpu_sriov_vf(adev) && adev->in_suspend) @@ -4626,7 +4628,7 @@ static int gfx_v8_0_kiq_init_queue(struct amdgpu_ring *ring) mutex_unlock(&adev->srbm_mutex); if (adev->gfx.kiq[0].mqd_backup) - memcpy(adev->gfx.kiq[0].mqd_backup, mqd, sizeof(struct vi_mqd_allocation)); + memcpy_fromio(adev->gfx.kiq[0].mqd_backup, mqd, sizeof(struct vi_mqd_allocation)); } return 0; @@ -4639,7 +4641,7 @@ static int gfx_v8_0_kcq_init_queue(struct amdgpu_ring *ring) int mqd_idx = ring - &adev->gfx.compute_ring[0]; if (!amdgpu_in_reset(adev) && !adev->in_suspend) { - memset((void *)mqd, 0, sizeof(struct vi_mqd_allocation)); + memset_io((void *)mqd, 0, sizeof(struct vi_mqd_allocation)); ((struct vi_mqd_allocation *)mqd)->dynamic_cu_mask = 0xFFFFFFFF; ((struct vi_mqd_allocation *)mqd)->dynamic_rb_mask = 0xFFFFFFFF; mutex_lock(&adev->srbm_mutex); @@ -4649,11 +4651,11 @@ static int gfx_v8_0_kcq_init_queue(struct amdgpu_ring *ring) mutex_unlock(&adev->srbm_mutex); if (adev->gfx.mec.mqd_backup[mqd_idx]) - memcpy(adev->gfx.mec.mqd_backup[mqd_idx], mqd, sizeof(struct vi_mqd_allocation)); + memcpy_fromio(adev->gfx.mec.mqd_backup[mqd_idx], mqd, sizeof(struct vi_mqd_allocation)); } else { /* restore MQD to a clean status */ if (adev->gfx.mec.mqd_backup[mqd_idx]) - memcpy(mqd, adev->gfx.mec.mqd_backup[mqd_idx], sizeof(struct vi_mqd_allocation)); + memcpy_toio(mqd, adev->gfx.mec.mqd_backup[mqd_idx], sizeof(struct vi_mqd_allocation)); /* reset ring buffer */ ring->wptr = 0; amdgpu_ring_clear_ring(ring); From 3c500269dbdb7cf068740dac2739b22131d59668 Mon Sep 17 00:00:00 2001 From: Coreforge Date: Thu, 23 Nov 2023 16:38:59 +0100 Subject: [PATCH 2/8] some alignment trapping, still wip --- arch/arm64/include/asm/exception.h | 1 + arch/arm64/kernel/compat_alignment.c | 313 ++++++++++++++++++++++++++- arch/arm64/mm/fault.c | 9 + 3 files changed, 322 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h index ad688e157c9bed..6241dec7195dab 100644 --- a/arch/arm64/include/asm/exception.h +++ b/arch/arm64/include/asm/exception.h @@ -68,6 +68,7 @@ void do_sp_pc_abort(unsigned long addr, unsigned long esr, struct pt_regs *regs) void bad_el0_sync(struct pt_regs *regs, int reason, unsigned long esr); void do_el0_cp15(unsigned long esr, struct pt_regs *regs); int do_compat_alignment_fixup(unsigned long addr, struct pt_regs *regs); +int do_alignment_fixup(unsigned long addr, struct pt_regs *regs); void do_el0_svc(struct pt_regs *regs); void do_el0_svc_compat(struct pt_regs *regs); void do_el0_fpac(struct pt_regs *regs, unsigned long esr); diff --git a/arch/arm64/kernel/compat_alignment.c b/arch/arm64/kernel/compat_alignment.c index deff21bfa6800c..65b9c19bcf4bd5 100644 --- a/arch/arm64/kernel/compat_alignment.c +++ b/arch/arm64/kernel/compat_alignment.c @@ -318,7 +318,7 @@ int do_compat_alignment_fixup(unsigned long addr, struct pt_regs *regs) int thumb2_32b = 0; instrptr = instruction_pointer(regs); - + printk("Alignment fixup\n"); if (compat_thumb_mode(regs)) { __le16 __user *ptr = (__le16 __user *)(instrptr & ~1); u16 tinstr, tinst2; @@ -381,3 +381,314 @@ int do_compat_alignment_fixup(unsigned long addr, struct pt_regs *regs) return 0; } + +// arm64# + +/* + *Happens with The Long Dark + * + *[ 6012.660803] Faulting instruction: 0x3d800020 +[ 6012.660813] Load/Store: op0 0x3 op1 0x1 op2 0x3 op3 0x0 op4 0x0 + */ + +struct fixupDescription{ + void* addr; + // + u64 data1; + u64 data1_simd; + u64 
data2; + u64 data2_simd; + + int Rs; // used for atomics (which don't get handled atomically) + + int simd; // wether or not this is a vector instruction + int load; // 1 is it's a load, 0 if it's a store + int pair; // 1 if it's a l/s pair instruction + int width; // width of the access in bits +}; + +static int alignment_get_arm64(struct pt_regs *regs, __le64 __user *ip, u32 *inst) +{ + __le32 instr = 0; + int fault; + + fault = get_user(instr, ip); + if (fault) + return fault; + + *inst = __le32_to_cpu(instr); + return 0; +} + +/*int ldpstp_offset_fixup(u32 instr, struct pt_regs *regs){ + uint8_t load = (instr >> 22) & 1; + uint8_t simd = (instr >> 26) & 1; + uint16_t imm7 = (instr >> 15) & 0x7f; + uint8_t Rt2 = (instr >> 10) & 0x1f; + uint8_t Rn = (instr >> 5) & 0x1f; + uint8_t Rt = instr & 0x1f; + + int16_t imm = 0xffff & imm7; + printk("Variant: 0x%x Load: %x SIMD: %x IMM: 0x%x Rt: 0x%x Rt2: 0x%x Rn: 0x%x\n", ((instr >> 30) & 3),load, simd, imm, Rt, Rt2, Rn); + if(((instr >> 30) & 3) == 2){ + // 64bit + if(!load){ + if(!simd){ + // 64bit store + u64 val1, val2; + val1 = regs->regs[Rt]; + val2 = regs->regs[Rt2]; + u64 addr = regs->regs[Rn] + imm; + printk("STP 64bit storing 0x%llx 0x%llx at 0x%llx\n", val1, val2, addr); + // for the first reg. Byte by byte to avoid any alignment issues + for(int i = 0; i < 8; i++){ + uint8_t v = (val1 >> (i*8)) & 0xff; + put_user(v, (uint8_t __user *)addr); + addr++; + } + // second reg + for(int i = 0; i < 8; i++){ + uint8_t v = (val2 >> (i*8)) & 0xff; + put_user(v, (uint8_t __user *)addr); + addr++; + } + arm64_skip_faulting_instruction(regs, 4); + } + } + } + return 0; +}*/ + +int do_ls_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){ + int r; + if(!desc->load){ + uint8_t* addr = desc->addr; + int bcount = desc->width / 8; // since the field stores the width in bits. 
Honestly, there's no particular reason for that
+    printk("Storing %d bytes (pair: %d) to 0x%llx",bcount, desc->pair, desc->addr);
+    for(int i = 0; i < bcount; i++){
+        if((r=put_user(desc->data1 & 0xff, (uint8_t __user *)addr)))
+            return r;
+        desc->data1 >>= 8;
+        addr++;
+    }
+
+    if(desc->pair){
+        for(int i = 0; i < bcount; i++){
+            if((r=put_user(desc->data2 & 0xff, (uint8_t __user *)addr)))
+                return r;
+            desc->data2 >>= 8;
+            addr++;
+        }
+    }
+    arm64_skip_faulting_instruction(regs, 4);
+    } else {
+        printk("Loading is currently not implemented (addr 0x%llx)\n", desc->addr);
+        return -1;
+    }
+    return 0;
+}
+
+int ls_cas_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){
+    uint8_t size = (instr >> 30) & 3;
+    uint8_t load = (instr >> 22) & 1; // acquire semantics, has no effect here, since it's not atomic anymore
+    uint8_t Rs = (instr >> 16) & 0x1f;
+    uint8_t Rt2 = (instr >> 10) & 0x1f;
+    uint8_t Rn = (instr >> 5) & 0x1f;
+    uint8_t Rt = instr & 0x1f;
+
+    uint8_t o0 = (instr >> 15) & 1; // L, release semantics, has no effect here, since it's not atomic anymore
+
+    if(Rt2 != 0x1f){
+        return -1;
+    }
+
+    switch(size){
+    case 0:
+        desc->width = 8;
+        break;
+    case 1:
+        desc->width = 16;
+        break;
+    case 2:
+        desc->width = 32;
+        break;
+    case 3:
+        desc->width = 64;
+        break;
+    }
+
+    desc->addr = (void*)regs->regs[Rn];
+    desc->data1 = regs->regs[Rt];
+
+    // nearly everything from here on could be moved into another function if needed
+    u64 cmpmask = (1 << desc->width) - 1;
+    u64 cmpval = regs->regs[Rs] & cmpmask;
+
+    u64 readval = 0;
+    int bcount = desc->width / 8;
+    u64 addr = (u64)desc->addr;
+    int r;
+    uint8_t tmp;
+
+    printk("Atomic CAS not being done atomically at 0x%llx, size %d\n",desc->addr, desc->width);
+
+    for(int i = 0; i < bcount; i++){
+        if((r=get_user(tmp, (uint8_t __user *)addr)))
+            return r;
+        readval |= ((u64)tmp) << (i * 8); // maybe this could be read directly into regs->regs[Rs]
+        addr++;
+    }
+
+    if((readval & cmpmask) == cmpval){
+        // swap
+        addr = (u64)desc->addr;
+
+        for(int i = 0; i < bcount; i++){
+            if((r=put_user(desc->data1 & 0xff, (uint8_t __user *)addr)))
+                return r;
+            desc->data1 >>= 8;
+            addr++;
+        }
+
+        regs->regs[Rs] = readval;
+    }
+
+    arm64_skip_faulting_instruction(regs, 4);
+
+    return 0;
+}
+
+int ls_pair_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){
+    uint8_t op2;
+    uint8_t opc;
+    op2 = (instr >> 23) & 3;
+    opc = (instr >> 30) & 3;
+
+    uint8_t load = (instr >> 22) & 1;
+    uint8_t simd = (instr >> 26) & 1;
+    uint16_t imm7 = (instr >> 15) & 0x7f;
+    uint8_t Rt2 = (instr >> 10) & 0x1f;
+    uint8_t Rn = (instr >> 5) & 0x1f;
+    uint8_t Rt = instr & 0x1f;
+
+    int16_t imm = 0xffff & imm7;
+
+    desc->load = load;
+    desc->simd = simd;
+
+    // opc controls the width
+    switch(opc){
+    case 0:
+        desc->width = 32;
+        imm <<= 2;
+        break;
+    case 2:
+        desc->width = 64;
+        imm <<= 3;
+        break;
+    default:
+        return -1;
+    }
+
+    // op2 controls the indexing
+    switch(op2){
+    case 2:
+        // offset
+        desc->addr = (void*)(regs->regs[Rn] + imm);
+        break;
+    default:
+        return -1;
+    }
+    desc->data1 = regs->regs[Rt];
+    desc->data2 = regs->regs[Rt2];
+
+    return do_ls_fixup(instr, regs, desc);
+
+}
+
+int ls_reg_unsigned_imm(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){
+    uint8_t size = (instr >> 30) & 3;
+    uint8_t simd = (instr >> 26) & 1;
+    uint8_t opc = (instr >> 22) & 3;
+
+    switch(size){
+    case 0:
+        desc->width = 8;
+        break;
+    case 1:
+        desc->width = 16;
+        break;
+    case 2:
+        desc->width = 32;
+        break;
+    case 3:
+        desc->width = 64;
+        break;
+    }
+    return
0; +} + +int ls_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){ + uint8_t op0; + uint8_t op1; + uint8_t op2; + uint8_t op3; + uint8_t op4; + + op0 = (instr >> 28) & 0xf; + op1 = (instr >> 26) & 1; + op2 = (instr >> 23) & 3; + op3 = (instr >> 16) & 0x3f; + op4 = (instr >> 10) & 3; + printk("Load/Store: op0 0x%x op1 0x%x op2 0x%x op3 0x%x op4 0x%x\n", op0, op1, op2, op3, op4); + if((op0 & 3) == 2){ + desc->pair = 1; + return ls_pair_fixup(instr, regs, desc); + } + if((op0 & 3) == 0 && op1 == 0 && op2 == 1 && (op3 & 0x20) == 0x20){ + // compare and swap + return ls_cas_fixup(instr, regs, desc); + } + if((op0 & 3) == 3 && (op2 & 3) == 3){ + //load/store unsigned immediate + desc->pair = 0; + + } + if((op0 & 3) == 2 && (op2 == 2)){ + // Load/store pair offset + //ldpstp_offset_fixup(instr, regs); + return ls_reg_unsigned_imm(instr, regs, desc); + } + return 0; +} + +int do_alignment_fixup(unsigned long addr, struct pt_regs *regs){ + unsigned long long instrptr; + u32 instr = 0; + + instrptr = instruction_pointer(regs); + printk("Alignment fixup\n"); + + if (alignment_get_arm64(regs, (__le64 __user *)instrptr, &instr)){ + printk("Failed to get aarch64 instruction\n"); + return 1; + } + printk("Faulting instruction: 0x%lx\n", instr); + /** + * List of seen faults: 020c00a9 (0xa9000c02) stp x2, x3, [x0] + * + */ + + uint8_t op0; + struct fixupDescription desc = {0}; + + op0 = ((instr & 0x1E000000) >> 25); + if((op0 & 5) == 0x4){ + printk("Load/Store\n"); + return ls_fixup(instr, regs, &desc); + } else { + printk("Not handling instruction with op0 0x%x ",op0); + } + return -1; +} diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 2e5d1e238af958..b434b681b8f003 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -674,6 +675,7 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr, * We had some memory, but were unable to successfully fix up * this page fault. 
*/ + printk("Page fault bus error\n"); arm64_force_sig_fault(SIGBUS, BUS_ADRERR, far, inf->name); } else if (fault & (VM_FAULT_HWPOISON_LARGE | VM_FAULT_HWPOISON)) { unsigned int lsb; @@ -716,9 +718,16 @@ static int __kprobes do_translation_fault(unsigned long far, static int do_alignment_fault(unsigned long far, unsigned long esr, struct pt_regs *regs) { + //printk("Alignment fault: fixup enabled?: %d, user mode: %d pstate: 0x%llx\n", IS_ENABLED(CONFIG_COMPAT_ALIGNMENT_FIXUPS), compat_user_mode(regs), regs->pstate); + trigger_all_cpu_backtrace(); if (IS_ENABLED(CONFIG_COMPAT_ALIGNMENT_FIXUPS) && compat_user_mode(regs)) return do_compat_alignment_fixup(far, regs); + + if(user_mode(regs)){ + // aarch64 user mode + return do_alignment_fixup(far, regs); + } do_bad_area(far, esr, regs); return 0; } From e835b9450fbfde2563fd14eb574d277181001e61 Mon Sep 17 00:00:00 2001 From: Coreforge Date: Sat, 2 Dec 2023 00:36:29 +0100 Subject: [PATCH 3/8] added 32/64bit str --- arch/arm64/kernel/compat_alignment.c | 129 ++++++++++++++++++++++++--- arch/arm64/mm/fault.c | 4 +- 2 files changed, 121 insertions(+), 12 deletions(-) diff --git a/arch/arm64/kernel/compat_alignment.c b/arch/arm64/kernel/compat_alignment.c index 65b9c19bcf4bd5..86154c777fa75f 100644 --- a/arch/arm64/kernel/compat_alignment.c +++ b/arch/arm64/kernel/compat_alignment.c @@ -385,10 +385,23 @@ int do_compat_alignment_fixup(unsigned long addr, struct pt_regs *regs) // arm64# /* - *Happens with The Long Dark + *Happens with The Long Dark (also with steam) * *[ 6012.660803] Faulting instruction: 0x3d800020 [ 6012.660813] Load/Store: op0 0x3 op1 0x1 op2 0x3 op3 0x0 op4 0x0 + * + *[ 555.449651] Load/Store: op0 0x3 op1 0x1 op2 0x1 op3 0x1 op4 0x0 +[ 555.449654] Faulting instruction: 0x3c810021 + * + * + *[ 555.449663] Load/Store: op0 0x3 op1 0x1 op2 0x1 op3 0x2 op4 0x0 +[ 555.449666] Faulting instruction: 0x3c820020 + * + *[ 555.449674] Load/Store: op0 0x3 op1 0x1 op2 0x1 op3 0x3 op4 0x0 +[ 555.449677] Faulting instruction: 0x3c830021 + * + * + * */ struct fixupDescription{ @@ -405,6 +418,8 @@ struct fixupDescription{ int load; // 1 is it's a load, 0 if it's a store int pair; // 1 if it's a l/s pair instruction int width; // width of the access in bits + int extendSign; + int extend_width; }; static int alignment_get_arm64(struct pt_regs *regs, __le64 __user *ip, u32 *inst) @@ -464,7 +479,7 @@ int do_ls_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){ if(!desc->load){ uint8_t* addr = desc->addr; int bcount = desc->width / 8; // since the field stores the width in bits. 
Honestly, there's no particular reason for that - printk("Storing %d bytes (pair: %d) to 0x%llx",bcount, desc->pair, desc->addr); + //printk("Storing %d bytes (pair: %d) to 0x%llx",bcount, desc->pair, desc->addr); for(int i = 0; i < bcount; i++){ if((r=put_user(desc->data1 & 0xff, (uint8_t __user *)addr))) return r; @@ -626,6 +641,84 @@ int ls_reg_unsigned_imm(u32 instr, struct pt_regs *regs, struct fixupDescription desc->width = 64; break; } + return 1; +} + + +u64 extend_reg(u64 reg, int type, int shift){ + + uint8_t is_signed = (type & 4) >> 2; + uint8_t input_width = type & 1; + + u64 tmp; + if(!is_signed){ + tmp = reg; + } else { + if(input_width == 0){ + // 32bit, needs to be extended to 64 + // I hope the compiler just does this kind of automatically with these types + int32_t stmpw = reg; + int64_t stmpdw = stmpw; + tmp = (u64)stmpdw; + } + } + + return tmp << shift; +} + +int lsr_offset_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){ + uint8_t size = (instr >> 30) & 3; + uint8_t simd = (instr >> 26) & 1; + uint8_t opc = (instr >> 22) & 3; + uint8_t option = (instr >> 13) & 5; + uint8_t Rm = (instr >> 16) & 0x1f; + uint8_t Rn = (instr >> 5) & 0x1f; + uint8_t Rt = instr & 0x1f; + uint8_t S = (instr >> 12) & 1; + // size==0 seems to be a bit special + // opc&2 is sign, opc&1 is load (for most instructions anyways) + + uint8_t load = opc & 1; + uint8_t extend_sign = (opc & 2) >> 1; + desc->pair = 0; + + desc->simd = simd; + desc->width = 8 << size; + + // the simd instructions make this a bit weird + if(!simd){ + if(extend_sign){ + if(load){ + desc->extend_width = 32; + } else { + desc->extend_width = 64; + } + desc->load = 1; + } else { + desc->load = load; + } + + desc->extendSign = extend_sign; // needed for load, which isn't implemented yet + + + u64 addr = regs->regs[Rn]; + + int shift = 0; + if(S) shift = 2 << ((size & 1) & ((size >> 1) & 1)); + + u64 offset = extend_reg(regs->regs[Rm], option, S); + + addr += offset; + + desc->data1 = regs->regs[Rt]; + desc->addr = (void*)addr; + + return do_ls_fixup(instr, regs, desc); + + } else { + printk("Load/Store register offset decode doesn't support simd yet\n"); + return 1; + } return 0; } @@ -636,19 +729,21 @@ int ls_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){ uint8_t op3; uint8_t op4; + int r = 1; + op0 = (instr >> 28) & 0xf; op1 = (instr >> 26) & 1; op2 = (instr >> 23) & 3; op3 = (instr >> 16) & 0x3f; op4 = (instr >> 10) & 3; - printk("Load/Store: op0 0x%x op1 0x%x op2 0x%x op3 0x%x op4 0x%x\n", op0, op1, op2, op3, op4); + if((op0 & 3) == 2){ desc->pair = 1; - return ls_pair_fixup(instr, regs, desc); + r = ls_pair_fixup(instr, regs, desc); } if((op0 & 3) == 0 && op1 == 0 && op2 == 1 && (op3 & 0x20) == 0x20){ // compare and swap - return ls_cas_fixup(instr, regs, desc); + r = ls_cas_fixup(instr, regs, desc); } if((op0 & 3) == 3 && (op2 & 3) == 3){ //load/store unsigned immediate @@ -658,9 +753,16 @@ int ls_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){ if((op0 & 3) == 2 && (op2 == 2)){ // Load/store pair offset //ldpstp_offset_fixup(instr, regs); - return ls_reg_unsigned_imm(instr, regs, desc); + //r = ls_reg_unsigned_imm(instr, regs, desc); } - return 0; + if((op0 & 3) == 3 && (op2 & 2) == 0 && (op3 & 0x20) == 0x20 && op4 == 2){ + // register offset load/store + r = lsr_offset_fixup(instr, regs, desc); + } + if(r){ + printk("Load/Store: op0 0x%x op1 0x%x op2 0x%x op3 0x%x op4 0x%x\n", op0, op1, op2, op3, op4); + } + return r; } int do_alignment_fixup(unsigned 
long addr, struct pt_regs *regs){ @@ -668,25 +770,30 @@ int do_alignment_fixup(unsigned long addr, struct pt_regs *regs){ u32 instr = 0; instrptr = instruction_pointer(regs); - printk("Alignment fixup\n"); + //printk("Alignment fixup\n"); if (alignment_get_arm64(regs, (__le64 __user *)instrptr, &instr)){ printk("Failed to get aarch64 instruction\n"); return 1; } - printk("Faulting instruction: 0x%lx\n", instr); + /** * List of seen faults: 020c00a9 (0xa9000c02) stp x2, x3, [x0] * */ uint8_t op0; + int r; struct fixupDescription desc = {0}; op0 = ((instr & 0x1E000000) >> 25); if((op0 & 5) == 0x4){ - printk("Load/Store\n"); - return ls_fixup(instr, regs, &desc); + //printk("Load/Store\n"); + r = ls_fixup(instr, regs, &desc); + if(r){ + printk("Faulting instruction: 0x%lx\n", instr); + } + return r; } else { printk("Not handling instruction with op0 0x%x ",op0); } diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index b434b681b8f003..17e0abc243d811 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -726,7 +726,9 @@ static int do_alignment_fault(unsigned long far, unsigned long esr, if(user_mode(regs)){ // aarch64 user mode - return do_alignment_fixup(far, regs); + if(!do_alignment_fixup(far, regs)){ + return 0; + } } do_bad_area(far, esr, regs); return 0; From 43969b6a05b5b105e249476734a8612d082a872f Mon Sep 17 00:00:00 2001 From: Coreforge Date: Wed, 6 Dec 2023 17:52:45 +0100 Subject: [PATCH 4/8] some SIMD stuff for unity (not quite enough yet) --- arch/arm64/kernel/compat_alignment.c | 79 ++++++++++++++++++++++++++-- arch/arm64/mm/fault.c | 3 +- 2 files changed, 76 insertions(+), 6 deletions(-) diff --git a/arch/arm64/kernel/compat_alignment.c b/arch/arm64/kernel/compat_alignment.c index 86154c777fa75f..072f5ff02c6b68 100644 --- a/arch/arm64/kernel/compat_alignment.c +++ b/arch/arm64/kernel/compat_alignment.c @@ -12,6 +12,8 @@ #include #include +#include + /* * 32-bit misaligned trap handler (c) 1998 San Mehat (CCC) -July 1998 * @@ -399,6 +401,9 @@ int do_compat_alignment_fixup(unsigned long addr, struct pt_regs *regs) * *[ 555.449674] Load/Store: op0 0x3 op1 0x1 op2 0x1 op3 0x3 op4 0x0 [ 555.449677] Faulting instruction: 0x3c830021 + +stur q1, [x1, #16] +potentially also ldur q0, [x1, #32] and ldur q1, [x1, #48] * * * @@ -406,7 +411,8 @@ int do_compat_alignment_fixup(unsigned long addr, struct pt_regs *regs) struct fixupDescription{ void* addr; - // + + // datax_simd has to be located directly after datax in memory u64 data1; u64 data1_simd; u64 data2; @@ -474,24 +480,40 @@ static int alignment_get_arm64(struct pt_regs *regs, __le64 __user *ip, u32 *ins return 0; }*/ +// saves the contents of the simd register reg to dst +void read_simd_reg(int reg, __uint128_t* dst){ + struct user_fpsimd_state st; + fpsimd_save_state(&st); + *dst = st.vregs[reg]; +} + int do_ls_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){ int r; + /*if(desc->width > 64){ + printk("Currently cannot process ls_fixup with a size of %d bits\n", desc->width); + return 1; + }*/ if(!desc->load){ uint8_t* addr = desc->addr; int bcount = desc->width / 8; // since the field stores the width in bits. 
Honestly, there's no particular reason for that + //printk("Storing %d bytes (pair: %d) to 0x%llx",bcount, desc->pair, desc->addr); + int addrIt = 0; for(int i = 0; i < bcount; i++){ - if((r=put_user(desc->data1 & 0xff, (uint8_t __user *)addr))) + if((r=put_user( (*(((uint8_t*)(&desc->data1)) + addrIt) & 0xff), (uint8_t __user *)addr))) return r; - desc->data1 >>= 8; + //desc->data1 >>= 8; + addrIt++; addr++; } + addrIt = 0; if(desc->pair){ for(int i = 0; i < bcount; i++){ - if((r=put_user(desc->data2 & 0xff, (uint8_t __user *)addr))) + if((r=put_user((*(((uint8_t*)(&desc->data2)) + addrIt) & 0xff) & 0xff, (uint8_t __user *)addr))) return r; - desc->data2 >>= 8; + //desc->data2 >>= 8; + addrIt++; addr++; } } @@ -722,6 +744,43 @@ int lsr_offset_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* d return 0; } +int lsr_unscaled_immediate_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){ + uint8_t size = (instr >> 30) & 3; + uint8_t simd = (instr >> 26) & 1; + uint8_t opc = (instr >> 22) & 3; + uint16_t imm9 = (instr >> 12) & 0x1ff; + uint8_t Rn = (instr >> 5) & 0x1f; + uint8_t Rt = instr & 0x1f; + + int16_t fullImm = 0; + // sign extend it + if(imm9 & 0x100){ + fullImm = 0xfe00 | imm9; + } else { + fullImm = imm9; + } + u64 addr = regs->regs[Rn]; + desc->addr = addr + fullImm; + desc->pair = 0; + + int load = opc & 1; + if(load){ + return 1; + } + if(simd){ + desc->simd = 1; + desc->width = 8 << (size | (opc << 1)); + // assuming store + __uint128_t tmp; + read_simd_reg(Rt, &tmp); + desc->data1 = tmp; + desc->data1_simd = *(((u64*)&tmp) + 1); + return do_ls_fixup(instr, regs, desc); + } + printk("SIMD: %d\n", simd); + return 1; +} + int ls_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){ uint8_t op0; uint8_t op1; @@ -759,6 +818,16 @@ int ls_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){ // register offset load/store r = lsr_offset_fixup(instr, regs, desc); } + if((op0 & 3) == 3 && (op2 & 2) == 0 && (op3 & 0x20) == 0x0 && op4 == 0){ + // register load/store unscaled immediate + r = lsr_unscaled_immediate_fixup(instr, regs, desc); + printk("Likely SIMD stuff, which isn't being handled properly at all!\n"); + if(r){ + arm64_skip_faulting_instruction(regs, 4); + // skip anyways + } + //r = 0; + } if(r){ printk("Load/Store: op0 0x%x op1 0x%x op2 0x%x op3 0x%x op4 0x%x\n", op0, op1, op2, op3, op4); } diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 17e0abc243d811..9247c958621a75 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -726,9 +726,10 @@ static int do_alignment_fault(unsigned long far, unsigned long esr, if(user_mode(regs)){ // aarch64 user mode - if(!do_alignment_fixup(far, regs)){ + if(do_alignment_fixup(far, regs) == 0){ return 0; } + printk("Unfixed alignment issue\n"); } do_bad_area(far, esr, regs); return 0; From b040025bd940aeadf2b4749b24f3f14de2cf3573 Mon Sep 17 00:00:00 2001 From: Coreforge Date: Fri, 8 Dec 2023 22:09:12 +0100 Subject: [PATCH 5/8] better simd fixup (still not entirely working) --- arch/arm64/kernel/compat_alignment.c | 176 +++++++++++++++++---------- 1 file changed, 109 insertions(+), 67 deletions(-) diff --git a/arch/arm64/kernel/compat_alignment.c b/arch/arm64/kernel/compat_alignment.c index 072f5ff02c6b68..25bbf992c5e452 100644 --- a/arch/arm64/kernel/compat_alignment.c +++ b/arch/arm64/kernel/compat_alignment.c @@ -13,6 +13,8 @@ #include #include +#include +#include /* * 32-bit misaligned trap handler (c) 1998 San Mehat (CCC) -July 1998 @@ -413,10 
+415,13 @@ struct fixupDescription{ void* addr; // datax_simd has to be located directly after datax in memory - u64 data1; + /*u64 data1; u64 data1_simd; u64 data2; - u64 data2_simd; + u64 data2_simd;*/ + + int reg1; + int reg2; int Rs; // used for atomics (which don't get handled atomically) @@ -481,14 +486,42 @@ static int alignment_get_arm64(struct pt_regs *regs, __le64 __user *ip, u32 *ins }*/ // saves the contents of the simd register reg to dst -void read_simd_reg(int reg, __uint128_t* dst){ - struct user_fpsimd_state st; - fpsimd_save_state(&st); - *dst = st.vregs[reg]; +void read_simd_reg(int reg, u64 dst[2]){ + struct user_fpsimd_state st = {0}; + //fpsimd_save_state(&st); + + if(!may_use_simd()){ + printk("may_use_simd returned false!\n"); + } + kernel_neon_begin(); + if(current->thread.sve_state){ + printk("SVE state is not NULL!\n"); + } + + dst[0] = *((u64*)(¤t->thread.uw.fpsimd_state.vregs[reg])); + dst[1] = *(((u64*)(¤t->thread.uw.fpsimd_state.vregs[reg])) + 1); + + kernel_neon_end(); } int do_ls_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){ int r; + u64 data1[2]; + u64 data2[2]; + + // the reg indices have to always be valid, even if the reg isn't being used + if(desc->simd){ + // At least currently, there aren't any simd instructions supported that use more than one data register + //__uint128_t tmp; + read_simd_reg(desc->reg1, data1); + //data1[0] = tmp; + //data1[1] = *(((u64*)&tmp) + 1); + printk("SIMD: storing 0x%llx %llx (%d bits) at 0x%px", data1[1], data1[0], desc->width, desc->addr); + } else { + data1[0] = regs->regs[desc->reg1]; + data2[0] = regs->regs[desc->reg2]; + } + /*if(desc->width > 64){ printk("Currently cannot process ls_fixup with a size of %d bits\n", desc->width); return 1; @@ -500,8 +533,10 @@ int do_ls_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){ //printk("Storing %d bytes (pair: %d) to 0x%llx",bcount, desc->pair, desc->addr); int addrIt = 0; for(int i = 0; i < bcount; i++){ - if((r=put_user( (*(((uint8_t*)(&desc->data1)) + addrIt) & 0xff), (uint8_t __user *)addr))) + if((r=put_user( (*(((uint8_t*)(data1)) + addrIt) & 0xff), (uint8_t __user *)addr))){ + printk("Failed to write data at 0x%px (base was 0x%px)\n", addr, desc->addr); return r; + } //desc->data1 >>= 8; addrIt++; addr++; @@ -510,8 +545,10 @@ int do_ls_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){ addrIt = 0; if(desc->pair){ for(int i = 0; i < bcount; i++){ - if((r=put_user((*(((uint8_t*)(&desc->data2)) + addrIt) & 0xff) & 0xff, (uint8_t __user *)addr))) + if((r=put_user((*(((uint8_t*)(data2)) + addrIt) & 0xff) & 0xff, (uint8_t __user *)addr))){ + printk("Failed to write data at 0x%px (base was 0x%px)\n", addr, desc->addr); return r; + } //desc->data2 >>= 8; addrIt++; addr++; @@ -519,7 +556,7 @@ int do_ls_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){ } arm64_skip_faulting_instruction(regs, 4); } else { - printk("Loading is currently not implemented (addr 0x%llx)\n", desc->addr); + printk("Loading is currently not implemented (addr 0x%px)\n", desc->addr); return -1; } return 0; @@ -555,7 +592,7 @@ int ls_cas_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc) } desc->addr = (void*)regs->regs[Rn]; - desc->data1 = regs->regs[Rt]; + u64 data1 = regs->regs[Rt]; // nearly everything from here on could be moved into another function if needed u64 cmpmask = (1 << desc->width) - 1; @@ -567,7 +604,7 @@ int ls_cas_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc) 
int r; uint8_t tmp; - printk("Atomic CAS not being done atomically at 0x%llx, size %d\n",desc->addr, desc->width); + printk("Atomic CAS not being done atomically at 0x%px, size %d\n",desc->addr, desc->width); for(int i = 0; i < bcount; i++){ if((r=get_user(tmp, (uint8_t __user *)addr))) @@ -582,9 +619,9 @@ int ls_cas_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc) addr = (u64)desc->addr; for(int i = 0; i < bcount; i++){ - if((r=put_user(desc->data1 & 0xff, (uint8_t __user *)addr))) + if((r=put_user(data1 & 0xff, (uint8_t __user *)addr))) return r; - desc->data1 >>= 8; + data1 >>= 8; addr++; } @@ -637,8 +674,10 @@ int ls_pair_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc default: return -1; } - desc->data1 = regs->regs[Rt]; - desc->data2 = regs->regs[Rt2]; + //desc->data1 = regs->regs[Rt]; + //desc->data2 = regs->regs[Rt2]; + desc->reg1 = Rt; + desc->reg2 = Rt2; return do_ls_fixup(instr, regs, desc); @@ -648,22 +687,29 @@ int ls_reg_unsigned_imm(u32 instr, struct pt_regs *regs, struct fixupDescription uint8_t size = (instr >> 30) & 3; uint8_t simd = (instr >> 26) & 1; uint8_t opc = (instr >> 22) & 3; + uint16_t imm12 = (instr >> 10) & 0xfff; + uint8_t Rn = (instr >> 5) & 0x1f; + uint8_t Rt = instr & 0x1f; - switch(size){ - case 0: - desc->width = 8; - break; - case 1: - desc->width = 16; - break; - case 2: - desc->width = 32; - break; - case 3: - desc->width = 64; - break; + uint8_t load = opc & 1; + uint8_t extend_sign = ((opc & 2) >> 1 ) & !simd; + printk("size: %d simd: %d opc: %d imm12: 0x%x Rn: %d Rt: %d\n", size, simd, opc, imm12, Rn, Rt); + // when in simd mode, opc&2 is a third size bit. Otherwise, it's there for sign extension + int width_shift = (size | (((opc & 2) & (simd << 1)) << 1)); + desc->width = 8 << width_shift; + + if((size & 1) && simd && (opc & 2)){ + return 1; } - return 1; + + desc->reg1 = Rt; + desc->simd = simd; + desc->extendSign = extend_sign; + u64 addr = regs->regs[Rn]; + desc->addr = addr + (imm12 << width_shift); + printk("unsigned imm\n"); + + return do_ls_fixup(instr, regs, desc); } @@ -697,50 +743,52 @@ int lsr_offset_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* d uint8_t Rn = (instr >> 5) & 0x1f; uint8_t Rt = instr & 0x1f; uint8_t S = (instr >> 12) & 1; + int width_shift = (size | (((opc & 2) & (simd << 1)) << 1)); // size==0 seems to be a bit special // opc&2 is sign, opc&1 is load (for most instructions anyways) uint8_t load = opc & 1; - uint8_t extend_sign = (opc & 2) >> 1; + uint8_t extend_sign = ((opc & 2) >> 1 ) & !simd; desc->pair = 0; desc->simd = simd; - desc->width = 8 << size; + desc->width = 8 << width_shift; // the simd instructions make this a bit weird - if(!simd){ - if(extend_sign){ - if(load){ - desc->extend_width = 32; - } else { - desc->extend_width = 64; - } - desc->load = 1; + if(extend_sign){ + if(load){ + desc->extend_width = 32; } else { - desc->load = load; + desc->extend_width = 64; } + desc->load = 1; + } else { + desc->load = load; + } - desc->extendSign = extend_sign; // needed for load, which isn't implemented yet - - - u64 addr = regs->regs[Rn]; + desc->extendSign = extend_sign; // needed for load, which isn't implemented yet + u64 offset = 0; + u64 addr = 0; + addr = regs->regs[Rn]; + if(simd){ + int shift = 0; + if(S) shift = width_shift; + offset = extend_reg(regs->regs[Rm], option, shift); + } else { int shift = 0; if(S) shift = 2 << ((size & 1) & ((size >> 1) & 1)); - u64 offset = extend_reg(regs->regs[Rm], option, S); - - addr += offset; + offset = 
extend_reg(regs->regs[Rm], option, shift); + } - desc->data1 = regs->regs[Rt]; - desc->addr = (void*)addr; + addr += offset; - return do_ls_fixup(instr, regs, desc); + //desc->data1 = regs->regs[Rt]; + desc->reg1 = Rt; + desc->addr = (void*)addr; - } else { - printk("Load/Store register offset decode doesn't support simd yet\n"); - return 1; - } + return do_ls_fixup(instr, regs, desc); return 0; } @@ -767,14 +815,15 @@ int lsr_unscaled_immediate_fixup(u32 instr, struct pt_regs *regs, struct fixupDe if(load){ return 1; } + desc->reg1 = Rt; if(simd){ desc->simd = 1; - desc->width = 8 << (size | (opc << 1)); + desc->width = 8 << (size | ((opc & 2) << 1)); // assuming store - __uint128_t tmp; + /*__uint128_t tmp; read_simd_reg(Rt, &tmp); desc->data1 = tmp; - desc->data1_simd = *(((u64*)&tmp) + 1); + desc->data1_simd = *(((u64*)&tmp) + 1);*/ return do_ls_fixup(instr, regs, desc); } printk("SIMD: %d\n", simd); @@ -809,10 +858,9 @@ int ls_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){ desc->pair = 0; } - if((op0 & 3) == 2 && (op2 == 2)){ - // Load/store pair offset - //ldpstp_offset_fixup(instr, regs); - //r = ls_reg_unsigned_imm(instr, regs, desc); + if((op0 & 3) == 3 && ((op2 & 2) == 2)){ + // register unsigned immediate + r = ls_reg_unsigned_imm(instr, regs, desc); } if((op0 & 3) == 3 && (op2 & 2) == 0 && (op3 & 0x20) == 0x20 && op4 == 2){ // register offset load/store @@ -821,12 +869,6 @@ int ls_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){ if((op0 & 3) == 3 && (op2 & 2) == 0 && (op3 & 0x20) == 0x0 && op4 == 0){ // register load/store unscaled immediate r = lsr_unscaled_immediate_fixup(instr, regs, desc); - printk("Likely SIMD stuff, which isn't being handled properly at all!\n"); - if(r){ - arm64_skip_faulting_instruction(regs, 4); - // skip anyways - } - //r = 0; } if(r){ printk("Load/Store: op0 0x%x op1 0x%x op2 0x%x op3 0x%x op4 0x%x\n", op0, op1, op2, op3, op4); From 1c9fba38da19d91db3d5bc586f8242b3298462b7 Mon Sep 17 00:00:00 2001 From: Coreforge Date: Sat, 13 Jan 2024 00:29:56 +0100 Subject: [PATCH 6/8] SIMD stp --- arch/arm64/kernel/compat_alignment.c | 103 +++++++++++++++++++++------ 1 file changed, 83 insertions(+), 20 deletions(-) diff --git a/arch/arm64/kernel/compat_alignment.c b/arch/arm64/kernel/compat_alignment.c index 25bbf992c5e452..0412dec7d7c867 100644 --- a/arch/arm64/kernel/compat_alignment.c +++ b/arch/arm64/kernel/compat_alignment.c @@ -446,6 +446,15 @@ static int alignment_get_arm64(struct pt_regs *regs, __le64 __user *ip, u32 *ins return 0; } +int64_t extend_sign(int64_t in, int bits){ + bits--; + if(in & (1 << bits)){ + // extend sign + return (0xffffffffffffffff << bits) | in; + } + return in; +} + /*int ldpstp_offset_fixup(u32 instr, struct pt_regs *regs){ uint8_t load = (instr >> 22) & 1; uint8_t simd = (instr >> 26) & 1; @@ -513,10 +522,16 @@ int do_ls_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){ if(desc->simd){ // At least currently, there aren't any simd instructions supported that use more than one data register //__uint128_t tmp; + + // probably better for performance to read both registers with one function to kernel_neon_* doesn't have to be called more than once read_simd_reg(desc->reg1, data1); + read_simd_reg(desc->reg2, data2); //data1[0] = tmp; //data1[1] = *(((u64*)&tmp) + 1); - printk("SIMD: storing 0x%llx %llx (%d bits) at 0x%px", data1[1], data1[0], desc->width, desc->addr); + ///printk("SIMD: storing 0x%llx %llx (%d bits) at 0x%px", data1[1], data1[0], desc->width, 
desc->addr);
+        if(desc->width < 128){
+            return -1;
+        }
     } else {
         data1[0] = regs->regs[desc->reg1];
         data2[0] = regs->regs[desc->reg2];
@@ -646,23 +661,29 @@ int ls_pair_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc
     uint8_t Rn = (instr >> 5) & 0x1f;
     uint8_t Rt = instr & 0x1f;
 
-    int16_t imm = 0xffff & imm7;
-
+    int64_t imm = extend_sign(imm7, 7);
+    int immshift = 0;
     desc->load = load;
     desc->simd = simd;
 
     // opc controls the width
-    switch(opc){
-    case 0:
-        desc->width = 32;
-        imm <<= 2;
-        break;
-    case 2:
-        desc->width = 64;
-        imm <<= 3;
-        break;
-    default:
-        return -1;
+    if(simd){
+        desc->width = 32 << opc;
+        immshift = 4 << opc;
+        imm <<= immshift;
+    } else {
+        switch(opc){
+        case 0:
+            desc->width = 32;
+            imm <<= 2;
+            break;
+        case 2:
+            desc->width = 64;
+            imm <<= 3;
+            break;
+        default:
+            return -1;
+        }
     }
 
     // op2 controls the indexing
@@ -687,15 +708,25 @@ int ls_reg_unsigned_imm(u32 instr, struct pt_regs *regs, struct fixupDescription
     uint8_t size = (instr >> 30) & 3;
     uint8_t simd = (instr >> 26) & 1;
     uint8_t opc = (instr >> 22) & 3;
-    uint16_t imm12 = (instr >> 10) & 0xfff;
+    uint64_t imm12 = (instr >> 10) & 0xfff;
     uint8_t Rn = (instr >> 5) & 0x1f;
     uint8_t Rt = instr & 0x1f;
 
    uint8_t load = opc & 1;
-    uint8_t extend_sign = ((opc & 2) >> 1 ) & !simd;
-    printk("size: %d simd: %d opc: %d imm12: 0x%x Rn: %d Rt: %d\n", size, simd, opc, imm12, Rn, Rt);
+    uint8_t extend_sign = 0;// = ((opc & 2) >> 1 ) & !simd;
+    int width_shift = 0;
+
+    if(simd){
+        extend_sign = 0;
+        width_shift = size | ((opc & 2) << 1);
+    } else {
+        extend_sign = ((opc & 2) >> 1 );
+        width_shift = size;
+    }
+
+    ///printk("size: %d simd: %d opc: %d imm12: 0x%x Rn: %d Rt: %d\n", size, simd, opc, imm12, Rn, Rt);
     // when in simd mode, opc&2 is a third size bit. Otherwise, it's there for sign extension
-    int width_shift = (size | (((opc & 2) & (simd << 1)) << 1));
+    //width_shift = (size | (((opc & 2) & (simd << 1)) << 1));
     desc->width = 8 << width_shift;
 
     if((size & 1) && simd && (opc & 2)){
@@ -707,7 +738,7 @@ int ls_reg_unsigned_imm(u32 instr, struct pt_regs *regs, struct fixupDescription
     desc->extendSign = extend_sign;
     u64 addr = regs->regs[Rn];
     desc->addr = addr + (imm12 << width_shift);
-    printk("unsigned imm\n");
+    ///printk("unsigned imm\n");
 
     return do_ls_fixup(instr, regs, desc);
 }
@@ -728,9 +759,14 @@ u64 extend_reg(u64 reg, int type, int shift){
             int32_t stmpw = reg;
             int64_t stmpdw = stmpw;
             tmp = (u64)stmpdw;
+        } else {
+            printk("Other branch I forgot about previously!\n");
+            tmp = reg; // since the size stays the same, I don't think this makes a difference
         }
     }
 
+    ///printk("extend_reg: reg 0x%lx out (before shift) 0x%lx signed: %x\n", reg, tmp, is_signed);
+
     return tmp << shift;
 }
@@ -826,7 +862,7 @@ int lsr_unscaled_immediate_fixup(u32 instr, struct pt_regs *regs, struct fixupDe
         desc->data1_simd = *(((u64*)&tmp) + 1);*/
         return do_ls_fixup(instr, regs, desc);
     }
-    printk("SIMD: %d\n", simd);
+    ///printk("SIMD: %d\n", simd);
     return 1;
 }
 
@@ -876,6 +912,31 @@ int ls_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){
     return r;
 }
 
+uint32_t* seenCMDs;
+size_t seenCMDCount = 0;
+size_t seenCMDSize = 0;
+
+void instrDBG(u32 instr){
+    for(size_t i = 0; i < seenCMDCount; i++){
+        if(seenCMDs[i] == instr){
+            return;
+        }
+    }
+    if(seenCMDSize == 0){
+        seenCMDs = krealloc(seenCMDs, sizeof(uint32_t), GFP_KERNEL);
+        seenCMDSize = 1;
+    }
+
+    if(seenCMDCount >= seenCMDSize){
+        seenCMDs = krealloc(seenCMDs, seenCMDSize * 2 * sizeof(uint32_t), GFP_KERNEL);
+        seenCMDSize *= 2;
+    }
+
+    seenCMDs[seenCMDCount] = instr;
+    seenCMDCount++;
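+    // the scan at the top of this function returns early for already-seen
+    // words, so each distinct faulting instruction is only logged once and a
+    // hot unaligned access can't flood dmesg
+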
printk("New instruction: %x", instr); +} + int do_alignment_fixup(unsigned long addr, struct pt_regs *regs){ unsigned long long instrptr; u32 instr = 0; @@ -893,6 +954,8 @@ int do_alignment_fixup(unsigned long addr, struct pt_regs *regs){ * */ + instrDBG(instr); + uint8_t op0; int r; struct fixupDescription desc = {0}; From c72c024779f20e57c2994eee293a45e733e6e6b6 Mon Sep 17 00:00:00 2001 From: Coreforge Date: Mon, 15 Jan 2024 20:37:21 +0100 Subject: [PATCH 7/8] ldr, DOOM Eternal now works --- arch/arm64/kernel/compat_alignment.c | 114 +++++++++++++++++++++------ 1 file changed, 92 insertions(+), 22 deletions(-) diff --git a/arch/arm64/kernel/compat_alignment.c b/arch/arm64/kernel/compat_alignment.c index 0412dec7d7c867..b5f7a728b7155e 100644 --- a/arch/arm64/kernel/compat_alignment.c +++ b/arch/arm64/kernel/compat_alignment.c @@ -513,28 +513,47 @@ void read_simd_reg(int reg, u64 dst[2]){ kernel_neon_end(); } + +void write_simd_reg(int reg, u64 src[2]){ + + if(!may_use_simd()){ + printk("may_use_simd returned false!\n"); + } + kernel_neon_begin(); + if(current->thread.sve_state){ + printk("SVE state is not NULL!\n"); + } + + *((u64*)(¤t->thread.uw.fpsimd_state.vregs[reg])) = src[0]; + *(((u64*)(¤t->thread.uw.fpsimd_state.vregs[reg])) + 1) = src[1]; + + kernel_neon_end(); +} + int do_ls_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){ int r; - u64 data1[2]; - u64 data2[2]; + u64 data1[2] = {0,0}; + u64 data2[2] = {0,0}; // the reg indices have to always be valid, even if the reg isn't being used - if(desc->simd){ - // At least currently, there aren't any simd instructions supported that use more than one data register - //__uint128_t tmp; - - // probably better for performance to read both registers with one function to kernel_neon_* doesn't have to be called more than once - read_simd_reg(desc->reg1, data1); - read_simd_reg(desc->reg2, data2); - //data1[0] = tmp; - //data1[1] = *(((u64*)&tmp) + 1); - ///printk("SIMD: storing 0x%llx %llx (%d bits) at 0x%px", data1[1], data1[0], desc->width, desc->addr); - if(desc->width < 128){ - return -1; + if(!desc->load){ + if(desc->simd){ + // At least currently, there aren't any simd instructions supported that use more than one data register + //__uint128_t tmp; + + // probably better for performance to read both registers with one function to kernel_neon_* doesn't have to be called more than once + read_simd_reg(desc->reg1, data1); + read_simd_reg(desc->reg2, data2); + //data1[0] = tmp; + //data1[1] = *(((u64*)&tmp) + 1); + ///printk("SIMD: storing 0x%llx %llx (%d bits) at 0x%px", data1[1], data1[0], desc->width, desc->addr); + /*if(desc->width < 128){ + return -1; + }*/ + } else { + data1[0] = regs->regs[desc->reg1]; + data2[0] = regs->regs[desc->reg2]; } - } else { - data1[0] = regs->regs[desc->reg1]; - data2[0] = regs->regs[desc->reg2]; } /*if(desc->width > 64){ @@ -571,8 +590,54 @@ int do_ls_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){ } arm64_skip_faulting_instruction(regs, 4); } else { - printk("Loading is currently not implemented (addr 0x%px)\n", desc->addr); - return -1; + //printk("Loading is currently not implemented (addr 0x%px)\n", desc->addr); + + uint8_t* addr = desc->addr; + int bcount = desc->width / 8; // since the field stores the width in bits. 
Honestly, there's no particular reason for that
+
+        //printk("Storing %d bytes (pair: %d) to 0x%llx",bcount, desc->pair, desc->addr);
+        int addrIt = 0;
+        for(int i = 0; i < bcount; i++){
+            uint8_t val;
+            if((r=get_user( val, (uint8_t __user *)addr))){
+                printk("Failed to write data at 0x%px (base was 0x%px)\n", addr, desc->addr);
+                return r;
+            }
+            *(((uint8_t*)data1) + addrIt) = val;
+            //desc->data1 >>= 8;
+            addrIt++;
+            addr++;
+        }
+
+        if(desc->simd){
+            write_simd_reg(desc->reg1, data1);
+        } else {
+            regs->regs[desc->reg1] = data1[0];
+        }
+
+        addrIt = 0;
+        if(desc->pair){
+            for(int i = 0; i < bcount; i++){
+                uint8_t val;
+                if((r=get_user(val, (uint8_t __user *)addr))){
+                    printk("Failed to write data at 0x%px (base was 0x%px)\n", addr, desc->addr);
+                    return r;
+                }
+                *(((uint8_t*)data2) + addrIt) = val;
+                //desc->data2 >>= 8;
+                addrIt++;
+                addr++;
+            }
+
+            if(desc->simd){
+                write_simd_reg(desc->reg2, data2);
+            } else {
+                regs->regs[desc->reg2] = data2[0];
+            }
+        }
+        arm64_skip_faulting_instruction(regs, 4);
+
+    }
     return 0;
 }
@@ -732,7 +797,7 @@ int ls_reg_unsigned_imm(u32 instr, struct pt_regs *regs, struct fixupDescription
     if((size & 1) && simd && (opc & 2)){
         return 1;
     }
-
+    desc->load = load;
     desc->reg1 = Rt;
     desc->simd = simd;
     desc->extendSign = extend_sign;
@@ -848,9 +913,10 @@ int lsr_unscaled_immediate_fixup(u32 instr, struct pt_regs *regs, struct fixupDe
     desc->pair = 0;
 
     int load = opc & 1;
-    if(load){
+    desc->load = load;
+    /*if(load){
         return 1;
-    }
+    }*/
     desc->reg1 = Rt;
     if(simd){
         desc->simd = 1;
@@ -861,6 +927,10 @@ int lsr_unscaled_immediate_fixup(u32 instr, struct pt_regs *regs, struct fixupDe
         desc->data1 = tmp;
         desc->data1_simd = *(((u64*)&tmp) + 1);*/
         return do_ls_fixup(instr, regs, desc);
+    } else {
+        desc->simd = 0;
+        desc->width = 8 << size;
+        return do_ls_fixup(instr, regs, desc);
     }
     ///printk("SIMD: %d\n", simd);
     return 1;
 }

From 168bfdc649d2059e9d5c79bb83c5bea222f2476d Mon Sep 17 00:00:00 2001
From: Coreforge
Date: Fri, 4 Oct 2024 17:48:44 +0200
Subject: [PATCH 8/8] some more alignment things

---
 arch/arm64/kernel/compat_alignment.c | 283 ++++++++++++++++++++++++---
 1 file changed, 260 insertions(+), 23 deletions(-)

diff --git a/arch/arm64/kernel/compat_alignment.c b/arch/arm64/kernel/compat_alignment.c
index b5f7a728b7155e..eafe781737fbec 100644
--- a/arch/arm64/kernel/compat_alignment.c
+++ b/arch/arm64/kernel/compat_alignment.c
@@ -411,6 +411,8 @@ potentially also ldur q0, [x1, #32] and ldur q1, [x1, #48]
  *
  */
 
+#include
+
 struct fixupDescription{
     void* addr;
 
@@ -431,9 +433,14 @@ struct fixupDescription{
     int width; // width of the access in bits
     int extendSign;
     int extend_width;
+
+    // profiling
+    u64 starttime;
+    u64 decodedtime;
+    u64 endtime;
 };
 
-static int alignment_get_arm64(struct pt_regs *regs, __le64 __user *ip, u32 *inst)
+__attribute__((always_inline)) inline static int alignment_get_arm64(struct pt_regs *regs, __le64 __user *ip, u32 *inst)
 {
     __le32 instr = 0;
     int fault;
@@ -446,7 +453,7 @@ static int alignment_get_arm64(struct pt_regs *regs, __le64 __user *ip, u32 *ins
     return 0;
 }
 
-int64_t extend_sign(int64_t in, int bits){
+__attribute__((always_inline)) inline int64_t extend_sign(int64_t in, int bits){
     bits--;
     if(in & (1 << bits)){
         // extend sign
@@ -495,7 +502,7 @@ int64_t extend_sign(int64_t in, int bits){
 }*/
 
 // saves the contents of the simd register reg to dst
-void read_simd_reg(int reg, u64 dst[2]){
+__attribute__((always_inline)) inline void read_simd_reg(int reg, u64 dst[2]){
     struct user_fpsimd_state st = {0};
     //fpsimd_save_state(&st);
@@ -514,7 +521,7
@@ void read_simd_reg(int reg, u64 dst[2]){ } -void write_simd_reg(int reg, u64 src[2]){ +__attribute__((always_inline)) inline void write_simd_reg(int reg, u64 src[2]){ if(!may_use_simd()){ printk("may_use_simd returned false!\n"); @@ -530,11 +537,228 @@ void write_simd_reg(int reg, u64 src[2]){ kernel_neon_end(); } +// these try to use larger access widths than single bytes. Slower for small loads/stores, but it might speed larger ones up + +__attribute__((always_inline)) inline int put_data2(int size, uint8_t* data, void* addr){ + int r = 0; + + while(size){ + if(size >= 4 && (((u64)addr % 4) == 0)){ + if((r=put_user( (*(((uint32_t*)(data)))), (uint32_t __user *)addr))){ + printk("Failed to write data at 0x%px (%d)\n", addr,r); + return r; + } + addr += 4; + data += 4; + size -= 4; + continue; + } + if(size >= 2 && (((u64)addr % 2) == 0)){ + if((r=put_user( (*(((uint16_t*)(data)))), (uint16_t __user *)addr))){ + printk("Failed to write data at 0x%px (%d)\n", addr,r); + return r; + } + addr += 2; + data += 2; + size -= 2; + continue; + } + // I guess the if is redundant here + if(size >= 1){ + if((r=put_user( (*(((uint8_t*)(data)))), (uint8_t __user *)addr))){ + printk("Failed to write data at 0x%px (%d)\n", addr,r); + return r; + } + addr += 1; + data += 1; + size -= 1; + continue; + } + + } + + return r; +} + +__attribute__((always_inline)) inline int get_data2(int size, uint8_t* data, void* addr){ + int r = 0; + uint32_t val32; + uint16_t val16; + uint8_t val8; + while(size){ + if(size >= 4 && (((u64)addr % 4) == 0)){ + if((r=get_user( val32, (uint32_t __user *)addr))){ + printk("Failed to read data at 0x%px\n", addr); + return r; + } + *((uint32_t*)data) = val32; + addr += 4; + data += 4; + size -= 4; + continue; + } + if(size >= 2 && (((u64)addr % 2) == 0)){ + if((r=get_user( val16, (uint16_t __user *)addr))){ + printk("Failed to read data at 0x%px\n", addr); + return r; + } + *((uint16_t*)data) = val16; + addr += 2; + data += 2; + size -= 2; + continue; + } + // I guess the if is redundant here + if(size >= 1){ + if((r=get_user( val8, (uint8_t __user *)addr))){ + printk("Failed to read data at 0x%px\n", addr); + return r; + } + *((uint8_t*)data) = val8; + addr += 1; + data += 1; + size -= 1; + continue; + } + + } + + return r; +} + + +// these should avoid some branching, but still use single byte accesses +__attribute__((always_inline)) inline int put_data(int size, uint8_t* data, void* addr){ + int r = 0; + int addrIt = 0; + + // with the fixed size loops, the compiler should be able to unroll them + // this should mean a lot less branching + switch(size){ + case 16: + for(int i = 0; i < 8; i++){ + if((r=put_user( (*(((uint8_t*)(data)) + addrIt) & 0xff), (uint8_t __user *)addr))){ + printk("Failed to write data at 0x%px\n", addr); + return r; + } + addrIt++; + addr++; + } + // fall through + case 8: + for(int i = 0; i < 4; i++){ + if((r=put_user( (*(data + addrIt) & 0xff), (uint8_t __user *)addr))){ + printk("Failed to write data at 0x%px\n", addr); + return r; + } + addrIt++; + addr++; + } + // fall through + case 4: + for(int i = 0; i < 2; i++){ + if((r=put_user( (*(data + addrIt) & 0xff), (uint8_t __user *)addr))){ + printk("Failed to write data at 0x%px\n", addr); + return r; + } + addrIt++; + addr++; + } + // fall through + case 2: + if((r=put_user( (*(data + addrIt) & 0xff), (uint8_t __user *)addr))){ + printk("Failed to write data at 0x%px\n", addr); + return r; + } + addrIt++; + addr++; + // fall through + case 1: + if((r=put_user( (*(data + addrIt) & 0xff), (uint8_t 
__user *)addr))){ + printk("Failed to write data at 0x%px\n", addr); + return r; + } + addrIt++; + addr++; + break; + default: + printk("unsupported size %d\n", size); + } + + return r; +} + +__attribute__((always_inline)) inline int get_data(int size, uint8_t* data, void* addr){ + int r = 0; + int addrIt = 0; + + // with the fixed size loops, the compiler should be able to unroll them + // this should mean a lot less branching + uint8_t val; + switch(size){ + case 16: + for(int i = 0; i < 8; i++){ + if((r=get_user( val, (uint8_t __user *)addr))){ + printk("Failed to read data at 0x%px\n", addr); + return r; + } + *(data + addrIt) = val; + addrIt++; + addr++; + } + // fall through + case 8: + for(int i = 0; i < 4; i++){ + if((r=get_user( val, (uint8_t __user *)addr))){ + printk("Failed to read data at 0x%px\n", addr); + return r; + } + *(data + addrIt) = val; + addrIt++; + addr++; + } + // fall through + case 4: + for(int i = 0; i < 2; i++){ + if((r=get_user( val, (uint8_t __user *)addr))){ + printk("Failed to read data at 0x%px\n", addr); + return r; + } + *(data + addrIt) = val; + addrIt++; + addr++; + } + // fall through + case 2: + if((r=get_user( val, (uint8_t __user *)addr))){ + printk("Failed to read data at 0x%px\n", addr); + return r; + } + *(data + addrIt) = val; + addrIt++; + addr++; + // fall through + case 1: + if((r=get_user( val, (uint8_t __user *)addr))){ + printk("Failed to read data at 0x%px\n", addr); + return r; + } + *(data + addrIt) = val; + addrIt++; + addr++; + break; + default: + printk("unsupported size %d\n", size); + } + + return r; +} + int do_ls_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){ int r; u64 data1[2] = {0,0}; u64 data2[2] = {0,0}; - + //desc->decodedtime = ktime_get_ns(); // the reg indices have to always be valid, even if the reg isn't being used if(!desc->load){ if(desc->simd){ @@ -568,25 +792,28 @@ int do_ls_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){ int addrIt = 0; for(int i = 0; i < bcount; i++){ if((r=put_user( (*(((uint8_t*)(data1)) + addrIt) & 0xff), (uint8_t __user *)addr))){ - printk("Failed to write data at 0x%px (base was 0x%px)\n", addr, desc->addr); + printk("Failed to write data at 0x%px (%d)(base was 0x%px)\n", addr, r, desc->addr); return r; } //desc->data1 >>= 8; addrIt++; addr++; } - + //put_data2(bcount, (uint8_t*)data1, addr); + //addr += bcount; addrIt = 0; if(desc->pair){ for(int i = 0; i < bcount; i++){ if((r=put_user((*(((uint8_t*)(data2)) + addrIt) & 0xff) & 0xff, (uint8_t __user *)addr))){ - printk("Failed to write data at 0x%px (base was 0x%px)\n", addr, desc->addr); + printk("Failed to write data at 0x%px (%d)(base was 0x%px)\n", addr, r, desc->addr); return r; } //desc->data2 >>= 8; addrIt++; addr++; } + //put_data2(bcount, (uint8_t*)data2, addr); + addr += bcount; } arm64_skip_faulting_instruction(regs, 4); } else { @@ -597,7 +824,7 @@ int do_ls_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){ //printk("Storing %d bytes (pair: %d) to 0x%llx",bcount, desc->pair, desc->addr); int addrIt = 0; - for(int i = 0; i < bcount; i++){ + /*for(int i = 0; i < bcount; i++){ uint8_t val; if((r=get_user( val, (uint8_t __user *)addr))){ printk("Failed to write data at 0x%px (base was 0x%px)\n", addr, desc->addr); @@ -607,7 +834,9 @@ int do_ls_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){ //desc->data1 >>= 8; addrIt++; addr++; - } + }*/ + get_data2(bcount, (uint8_t*)data1, addr); + addr += bcount; if(desc->simd){ write_simd_reg(desc->reg1, 
data1);
@@ -617,7 +846,7 @@ int do_ls_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){
 
         addrIt = 0;
         if(desc->pair){
-            for(int i = 0; i < bcount; i++){
+            /*for(int i = 0; i < bcount; i++){
                 uint8_t val;
                 if((r=get_user(val, (uint8_t __user *)addr))){
                     printk("Failed to write data at 0x%px (base was 0x%px)\n", addr, desc->addr);
@@ -627,8 +856,10 @@ int do_ls_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){
                 //desc->data2 >>= 8;
                 addrIt++;
                 addr++;
-            }
+            }*/
+            get_data2(bcount, (uint8_t*)data2, addr);
+            addr += bcount;
             if(desc->simd){
                 write_simd_reg(desc->reg2, data2);
             } else {
                 regs->regs[desc->reg2] = data2[0];
             }
@@ -713,7 +944,7 @@ int ls_cas_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc)
     return 0;
 }
 
-int ls_pair_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){
+__attribute__((always_inline)) inline int ls_pair_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){
     uint8_t op2;
     uint8_t opc;
     op2 = (instr >> 23) & 3;
     opc = (instr >> 30) & 3;
@@ -727,15 +958,16 @@ int ls_pair_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc
     uint8_t Rt = instr & 0x1f;
 
     int64_t imm = extend_sign(imm7, 7);
-    int immshift = 0;
+    //int immshift = 0;
     desc->load = load;
     desc->simd = simd;
 
     // opc controls the width
     if(simd){
         desc->width = 32 << opc;
-        immshift = 4 << opc;
-        imm <<= immshift;
+        //immshift = 4 << opc;
+        imm <<= 2;
+        imm <<= opc;
     } else {
         switch(opc){
         case 0:
@@ -769,7 +1001,7 @@ int ls_pair_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc
 
 }
 
-int ls_reg_unsigned_imm(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){
+__attribute__((always_inline)) inline int ls_reg_unsigned_imm(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){
     uint8_t size = (instr >> 30) & 3;
     uint8_t simd = (instr >> 26) & 1;
     uint8_t opc = (instr >> 22) & 3;
@@ -809,7 +1041,7 @@ int ls_reg_unsigned_imm(u32 instr, struct pt_regs *regs, struct fixupDescription
 }
 
 
-u64 extend_reg(u64 reg, int type, int shift){
+__attribute__((always_inline)) inline u64 extend_reg(u64 reg, int type, int shift){
 
     uint8_t is_signed = (type & 4) >> 2;
     uint8_t input_width = type & 1;
@@ -835,7 +1067,7 @@ u64 extend_reg(u64 reg, int type, int shift){
 
     return tmp << shift;
 }
 
-int lsr_offset_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){
+__attribute__((always_inline)) inline int lsr_offset_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){
     uint8_t size = (instr >> 30) & 3;
     uint8_t simd = (instr >> 26) & 1;
     uint8_t opc = (instr >> 22) & 3;
@@ -893,7 +1125,7 @@ int lsr_offset_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* d
     return 0;
 }
 
-int lsr_unscaled_immediate_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){
+__attribute__((always_inline)) inline int lsr_unscaled_immediate_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){
     uint8_t size = (instr >> 30) & 3;
     uint8_t simd = (instr >> 26) & 1;
     uint8_t opc = (instr >> 22) & 3;
@@ -936,7 +1168,7 @@ int lsr_unscaled_immediate_fixup(u32 instr, struct pt_regs *regs, struct fixupDe
     return 1;
 }
 
-int ls_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){
+__attribute__((always_inline)) inline int ls_fixup(u32 instr, struct pt_regs *regs, struct fixupDescription* desc){
     uint8_t op0;
     uint8_t op1;
     uint8_t op2;
@@ -1024,19 +1256,24 @@ int do_alignment_fixup(unsigned long addr, struct pt_regs *regs){
      *
      */
 
-    instrDBG(instr);
+    //instrDBG(instr);
 
     uint8_t op0;
     int r;
     struct fixupDescription desc = {0};
-
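+    // instr[28:25] ("op0" in the top-level A64 encoding table of the Arm ARM)
+    // selects the encoding group; the x1x0 pattern matched below (mask 0b0101,
+    // value 0b0100) is the loads-and-stores group. For the stp recorded above,
+    // 0xa9000c02, instr[28:25] is 0b0100, so it gets routed to ls_fixup().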
//desc.starttime = ktime_get_ns(); op0 = ((instr & 0x1E000000) >> 25); if((op0 & 5) == 0x4){ //printk("Load/Store\n"); r = ls_fixup(instr, regs, &desc); + //desc.endtime = ktime_get_ns(); + /*printk("Trap timing: decoding: %ldns, mem ops: %ldns, total: %ldns\n", desc.decodedtime - desc.starttime, + desc.endtime - desc.decodedtime, desc.endtime - desc.starttime); + */ if(r){ printk("Faulting instruction: 0x%lx\n", instr); } + return r; } else { printk("Not handling instruction with op0 0x%x ",op0);
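
Since do_alignment_fixup() only slices bit fields out of the faulting word, the decode path can be sanity-checked entirely in userspace. The standalone sketch below is not part of the patch series; it merely mirrors the field extraction done by do_alignment_fixup(), ls_fixup() and ls_pair_fixup() for the one fault word recorded in the comments above, 0xa9000c02 (stp x2, x3, [x0]):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* "List of seen faults": 0xa9000c02 is stp x2, x3, [x0] */
	uint32_t instr = 0xa9000c02;

	uint8_t group = (instr >> 25) & 0xf; /* instr[28:25], op0 in do_alignment_fixup() */
	uint8_t op0 = (instr >> 28) & 0xf;   /* instr[31:28], op0 in ls_fixup() */
	uint8_t opc = (instr >> 30) & 3;     /* pair width: 0 -> 32 bit, 2 -> 64 bit */
	uint8_t simd = (instr >> 26) & 1;
	uint8_t op2 = (instr >> 23) & 3;     /* 2 -> signed offset, no writeback */
	uint8_t load = (instr >> 22) & 1;
	uint8_t imm7 = (instr >> 15) & 0x7f;
	uint8_t Rt2 = (instr >> 10) & 0x1f;
	uint8_t Rn = (instr >> 5) & 0x1f;
	uint8_t Rt = instr & 0x1f;

	/* sign-extend imm7 and scale it by the access size, as ls_pair_fixup() does */
	int64_t imm = (imm7 & 0x40) ? (int64_t)imm7 - 0x80 : imm7;
	imm <<= (opc == 2) ? 3 : 2;

	printf("loads-and-stores group: %s\n", ((group & 5) == 4) ? "yes" : "no");
	printf("pair: %s  simd: %d  load: %d  width: %d bits\n",
	       ((op0 & 3) == 2) ? "yes" : "no", simd, load, (opc == 2) ? 64 : 32);
	printf("%s x%d, x%d, [x%d, #%lld]  (op2=%d)\n",
	       load ? "ldp" : "stp", Rt, Rt2, Rn, (long long)imm, op2);
	return 0;
}

Running it prints the group, pair, width and register fields the fixup would use, which makes it easy to check new fault words from dmesg against the decoder before extending the kernel side.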