
Commit 520133b

Merge tag 'amd-drm-fixes-6.18-2025-10-16' of https://gitlab.freedesktop.org/agd5f/linux into drm-fixes
amd-drm-fixes-6.18-2025-10-16:

amdgpu:
- Backlight fix
- SI fixes
- CIK fix
- Make CE support debug only
- IP discovery fix
- Ring reset fixes
- GPUVM fault memory barrier fix
- Drop unused structures in amdgpu_drm.h
- JPEG debugfs fix
- VRAM handling fixes for GPUs without VRAM
- GC 12 MES fixes

amdkfd:
- MES fix

Signed-off-by: Dave Airlie <airlied@redhat.com>
From: Alex Deucher <alexander.deucher@amd.com>
Link: https://lore.kernel.org/r/20251016132224.2534946-1-alexander.deucher@amd.com
2 parents f69f31e + 079ae51 commit 520133b

28 files changed (+166, -144 lines)

drivers/gpu/drm/amd/amdgpu/amdgpu.h

Lines changed: 1 addition & 0 deletions
@@ -1290,6 +1290,7 @@ struct amdgpu_device {
         bool debug_disable_gpu_ring_reset;
         bool debug_vm_userptr;
         bool debug_disable_ce_logs;
+        bool debug_enable_ce_cs;

         /* Protection for the following isolation structure */
         struct mutex enforce_isolation_mutex;

drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c

Lines changed: 2 additions & 3 deletions
@@ -2329,10 +2329,9 @@ void amdgpu_amdkfd_gpuvm_unmap_gtt_bo_from_kernel(struct kgd_mem *mem)
 int amdgpu_amdkfd_gpuvm_get_vm_fault_info(struct amdgpu_device *adev,
                                           struct kfd_vm_fault_info *mem)
 {
-        if (atomic_read(&adev->gmc.vm_fault_info_updated) == 1) {
+        if (atomic_read_acquire(&adev->gmc.vm_fault_info_updated) == 1) {
                 *mem = *adev->gmc.vm_fault_info;
-                mb(); /* make sure read happened */
-                atomic_set(&adev->gmc.vm_fault_info_updated, 0);
+                atomic_set_release(&adev->gmc.vm_fault_info_updated, 0);
         }
         return 0;
 }
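
Note on the hunk above: the explicit mb() is replaced by acquire/release ordering on the flag itself. The consumer side is exactly what the diff shows; the producer below is only a sketch of the pairing it relies on (the real writer lives in the GMC fault-interrupt path and is not part of this commit), using a hypothetical payload struct:

#include <linux/atomic.h>
#include <linux/types.h>

struct fault_slot {
        u64 addr;               /* hypothetical payload */
        u32 status;
        atomic_t updated;       /* publication flag */
};

static void producer_publish(struct fault_slot *s, u64 addr, u32 status)
{
        s->addr = addr;
        s->status = status;
        /* release: the payload stores are visible before the flag reads as 1 */
        atomic_set_release(&s->updated, 1);
}

static bool consumer_take(struct fault_slot *s, u64 *addr, u32 *status)
{
        /* acquire: the payload reads below cannot be hoisted above this check */
        if (atomic_read_acquire(&s->updated) != 1)
                return false;
        *addr = s->addr;
        *status = s->status;
        /* release: the copies complete before the slot is marked free again */
        atomic_set_release(&s->updated, 0);
        return true;
}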

drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c

Lines changed: 7 additions & 1 deletion
@@ -364,6 +364,12 @@ static int amdgpu_cs_p2_ib(struct amdgpu_cs_parser *p,
         if (p->uf_bo && ring->funcs->no_user_fence)
                 return -EINVAL;

+        if (!p->adev->debug_enable_ce_cs &&
+            chunk_ib->flags & AMDGPU_IB_FLAG_CE) {
+                dev_err_ratelimited(p->adev->dev, "CE CS is blocked, use debug=0x400 to override\n");
+                return -EINVAL;
+        }
+
         if (chunk_ib->ip_type == AMDGPU_HW_IP_GFX &&
             chunk_ib->flags & AMDGPU_IB_FLAG_PREEMPT) {
                 if (chunk_ib->flags & AMDGPU_IB_FLAG_CE)
@@ -702,7 +708,7 @@ static void amdgpu_cs_get_threshold_for_moves(struct amdgpu_device *adev,
          */
         const s64 us_upper_bound = 200000;

-        if (!adev->mm_stats.log2_max_MBps) {
+        if ((!adev->mm_stats.log2_max_MBps) || !ttm_resource_manager_used(&adev->mman.vram_mgr.manager)) {
                 *max_bytes = 0;
                 *max_vis_bytes = 0;
                 return;
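
The second hunk keeps amdgpu_cs_get_threshold_for_moves() from computing move-throttling budgets against a VRAM heap that does not exist: on boards without dedicated VRAM the manager is never enabled, so both thresholds collapse to zero and moves are not throttled against VRAM. The same guard reappears in amdgpu_kms.c below. For reference, the TTM helper is essentially a flag check (paraphrased from include/drm/ttm/ttm_resource.h; treat the exact body as an assumption):

static inline bool ttm_resource_manager_used(struct ttm_resource_manager *man)
{
        return man->use_type;
}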

drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

Lines changed: 7 additions & 0 deletions
@@ -1882,6 +1882,13 @@ static bool amdgpu_device_pcie_dynamic_switching_supported(struct amdgpu_device

 static bool amdgpu_device_aspm_support_quirk(struct amdgpu_device *adev)
 {
+        /* Enabling ASPM causes random hangs on Tahiti and Oland on Zen4.
+         * It's unclear if this is a platform-specific or GPU-specific issue.
+         * Disable ASPM on SI for the time being.
+         */
+        if (adev->family == AMDGPU_FAMILY_SI)
+                return true;
+
 #if IS_ENABLED(CONFIG_X86)
         struct cpuinfo_x86 *c = &cpu_data(0);

drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c

Lines changed: 17 additions & 1 deletion
@@ -1033,7 +1033,9 @@ static uint8_t amdgpu_discovery_get_harvest_info(struct amdgpu_device *adev,
         /* Until a uniform way is figured, get mask based on hwid */
         switch (hw_id) {
         case VCN_HWID:
-                harvest = ((1 << inst) & adev->vcn.inst_mask) == 0;
+                /* VCN vs UVD+VCE */
+                if (!amdgpu_ip_version(adev, VCE_HWIP, 0))
+                        harvest = ((1 << inst) & adev->vcn.inst_mask) == 0;
                 break;
         case DMU_HWID:
                 if (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK)
@@ -2565,7 +2567,9 @@ int amdgpu_discovery_set_ip_blocks(struct amdgpu_device *adev)
                 amdgpu_discovery_init(adev);
                 vega10_reg_base_init(adev);
                 adev->sdma.num_instances = 2;
+                adev->sdma.sdma_mask = 3;
                 adev->gmc.num_umc = 4;
+                adev->gfx.xcc_mask = 1;
                 adev->ip_versions[MMHUB_HWIP][0] = IP_VERSION(9, 0, 0);
                 adev->ip_versions[ATHUB_HWIP][0] = IP_VERSION(9, 0, 0);
                 adev->ip_versions[OSSSYS_HWIP][0] = IP_VERSION(4, 0, 0);
@@ -2592,7 +2596,9 @@ int amdgpu_discovery_set_ip_blocks(struct amdgpu_device *adev)
                 amdgpu_discovery_init(adev);
                 vega10_reg_base_init(adev);
                 adev->sdma.num_instances = 2;
+                adev->sdma.sdma_mask = 3;
                 adev->gmc.num_umc = 4;
+                adev->gfx.xcc_mask = 1;
                 adev->ip_versions[MMHUB_HWIP][0] = IP_VERSION(9, 3, 0);
                 adev->ip_versions[ATHUB_HWIP][0] = IP_VERSION(9, 3, 0);
                 adev->ip_versions[OSSSYS_HWIP][0] = IP_VERSION(4, 0, 1);
@@ -2619,8 +2625,10 @@ int amdgpu_discovery_set_ip_blocks(struct amdgpu_device *adev)
                 amdgpu_discovery_init(adev);
                 vega10_reg_base_init(adev);
                 adev->sdma.num_instances = 1;
+                adev->sdma.sdma_mask = 1;
                 adev->vcn.num_vcn_inst = 1;
                 adev->gmc.num_umc = 2;
+                adev->gfx.xcc_mask = 1;
                 if (adev->apu_flags & AMD_APU_IS_RAVEN2) {
                         adev->ip_versions[MMHUB_HWIP][0] = IP_VERSION(9, 2, 0);
                         adev->ip_versions[ATHUB_HWIP][0] = IP_VERSION(9, 2, 0);
@@ -2665,7 +2673,9 @@ int amdgpu_discovery_set_ip_blocks(struct amdgpu_device *adev)
                 amdgpu_discovery_init(adev);
                 vega20_reg_base_init(adev);
                 adev->sdma.num_instances = 2;
+                adev->sdma.sdma_mask = 3;
                 adev->gmc.num_umc = 8;
+                adev->gfx.xcc_mask = 1;
                 adev->ip_versions[MMHUB_HWIP][0] = IP_VERSION(9, 4, 0);
                 adev->ip_versions[ATHUB_HWIP][0] = IP_VERSION(9, 4, 0);
                 adev->ip_versions[OSSSYS_HWIP][0] = IP_VERSION(4, 2, 0);
@@ -2693,8 +2703,10 @@ int amdgpu_discovery_set_ip_blocks(struct amdgpu_device *adev)
                 amdgpu_discovery_init(adev);
                 arct_reg_base_init(adev);
                 adev->sdma.num_instances = 8;
+                adev->sdma.sdma_mask = 0xff;
                 adev->vcn.num_vcn_inst = 2;
                 adev->gmc.num_umc = 8;
+                adev->gfx.xcc_mask = 1;
                 adev->ip_versions[MMHUB_HWIP][0] = IP_VERSION(9, 4, 1);
                 adev->ip_versions[ATHUB_HWIP][0] = IP_VERSION(9, 4, 1);
                 adev->ip_versions[OSSSYS_HWIP][0] = IP_VERSION(4, 2, 1);
@@ -2726,8 +2738,10 @@ int amdgpu_discovery_set_ip_blocks(struct amdgpu_device *adev)
                 amdgpu_discovery_init(adev);
                 aldebaran_reg_base_init(adev);
                 adev->sdma.num_instances = 5;
+                adev->sdma.sdma_mask = 0x1f;
                 adev->vcn.num_vcn_inst = 2;
                 adev->gmc.num_umc = 4;
+                adev->gfx.xcc_mask = 1;
                 adev->ip_versions[MMHUB_HWIP][0] = IP_VERSION(9, 4, 2);
                 adev->ip_versions[ATHUB_HWIP][0] = IP_VERSION(9, 4, 2);
                 adev->ip_versions[OSSSYS_HWIP][0] = IP_VERSION(4, 4, 0);
@@ -2762,6 +2776,8 @@ int amdgpu_discovery_set_ip_blocks(struct amdgpu_device *adev)
         } else {
                 cyan_skillfish_reg_base_init(adev);
                 adev->sdma.num_instances = 2;
+                adev->sdma.sdma_mask = 3;
+                adev->gfx.xcc_mask = 1;
                 adev->ip_versions[MMHUB_HWIP][0] = IP_VERSION(2, 0, 3);
                 adev->ip_versions[ATHUB_HWIP][0] = IP_VERSION(2, 0, 3);
                 adev->ip_versions[OSSSYS_HWIP][0] = IP_VERSION(5, 0, 1);
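
The hard-coded values above follow directly from the instance counts next to them: sdma_mask is a full mask of num_instances bits (2 -> 0x3, 1 -> 0x1, 8 -> 0xff, 5 -> 0x1f), and xcc_mask = 1 marks the single graphics XCC on these pre-IP-discovery parts. An equivalent way to express the SDMA relationship (illustration only, not the code this commit uses):

        adev->sdma.sdma_mask = GENMASK(adev->sdma.num_instances - 1, 0); /* e.g. 2 instances -> 0x3 */
        adev->gfx.xcc_mask = 1;                                          /* one graphics XCC */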

drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c

Lines changed: 7 additions & 1 deletion
@@ -144,7 +144,8 @@ enum AMDGPU_DEBUG_MASK {
         AMDGPU_DEBUG_DISABLE_GPU_RING_RESET = BIT(6),
         AMDGPU_DEBUG_SMU_POOL = BIT(7),
         AMDGPU_DEBUG_VM_USERPTR = BIT(8),
-        AMDGPU_DEBUG_DISABLE_RAS_CE_LOG = BIT(9)
+        AMDGPU_DEBUG_DISABLE_RAS_CE_LOG = BIT(9),
+        AMDGPU_DEBUG_ENABLE_CE_CS = BIT(10)
 };

 unsigned int amdgpu_vram_limit = UINT_MAX;
@@ -2289,6 +2290,11 @@ static void amdgpu_init_debug_options(struct amdgpu_device *adev)
                 pr_info("debug: disable kernel logs of correctable errors\n");
                 adev->debug_disable_ce_logs = true;
         }
+
+        if (amdgpu_debug_mask & AMDGPU_DEBUG_ENABLE_CE_CS) {
+                pr_info("debug: allowing command submission to CE engine\n");
+                adev->debug_enable_ce_cs = true;
+        }
 }

 static unsigned long amdgpu_fix_asic_type(struct pci_dev *pdev, unsigned long flags)
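
The new flag's value lines up with the hint printed by the amdgpu_cs.c check earlier in this commit: BIT(10) = 1 << 10 = 0x400. A compile-time restatement of that relationship (illustration only; the name of the module parameter that feeds amdgpu_debug_mask is not shown in this diff):

#include <linux/bits.h>
#include <linux/build_bug.h>

static_assert(BIT(10) == 0x400, "AMDGPU_DEBUG_ENABLE_CE_CS must match the debug=0x400 hint");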

drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c

Lines changed: 45 additions & 9 deletions
@@ -758,11 +758,42 @@ void amdgpu_fence_driver_force_completion(struct amdgpu_ring *ring)
  * @fence: fence of the ring to signal
  *
  */
-void amdgpu_fence_driver_guilty_force_completion(struct amdgpu_fence *fence)
+void amdgpu_fence_driver_guilty_force_completion(struct amdgpu_fence *af)
 {
-        dma_fence_set_error(&fence->base, -ETIME);
-        amdgpu_fence_write(fence->ring, fence->seq);
-        amdgpu_fence_process(fence->ring);
+        struct dma_fence *unprocessed;
+        struct dma_fence __rcu **ptr;
+        struct amdgpu_fence *fence;
+        struct amdgpu_ring *ring = af->ring;
+        unsigned long flags;
+        u32 seq, last_seq;
+
+        last_seq = amdgpu_fence_read(ring) & ring->fence_drv.num_fences_mask;
+        seq = ring->fence_drv.sync_seq & ring->fence_drv.num_fences_mask;
+
+        /* mark all fences from the guilty context with an error */
+        spin_lock_irqsave(&ring->fence_drv.lock, flags);
+        do {
+                last_seq++;
+                last_seq &= ring->fence_drv.num_fences_mask;
+
+                ptr = &ring->fence_drv.fences[last_seq];
+                rcu_read_lock();
+                unprocessed = rcu_dereference(*ptr);
+
+                if (unprocessed && !dma_fence_is_signaled_locked(unprocessed)) {
+                        fence = container_of(unprocessed, struct amdgpu_fence, base);
+
+                        if (fence == af)
+                                dma_fence_set_error(&fence->base, -ETIME);
+                        else if (fence->context == af->context)
+                                dma_fence_set_error(&fence->base, -ECANCELED);
+                }
+                rcu_read_unlock();
+        } while (last_seq != seq);
+        spin_unlock_irqrestore(&ring->fence_drv.lock, flags);
+        /* signal the guilty fence */
+        amdgpu_fence_write(ring, af->seq);
+        amdgpu_fence_process(ring);
 }

 void amdgpu_fence_save_wptr(struct dma_fence *fence)
@@ -790,14 +821,19 @@ void amdgpu_ring_backup_unprocessed_commands(struct amdgpu_ring *ring,
         struct dma_fence *unprocessed;
         struct dma_fence __rcu **ptr;
         struct amdgpu_fence *fence;
-        u64 wptr, i, seqno;
+        u64 wptr;
+        u32 seq, last_seq;

-        seqno = amdgpu_fence_read(ring);
+        last_seq = amdgpu_fence_read(ring) & ring->fence_drv.num_fences_mask;
+        seq = ring->fence_drv.sync_seq & ring->fence_drv.num_fences_mask;
         wptr = ring->fence_drv.signalled_wptr;
         ring->ring_backup_entries_to_copy = 0;

-        for (i = seqno + 1; i <= ring->fence_drv.sync_seq; ++i) {
-                ptr = &ring->fence_drv.fences[i & ring->fence_drv.num_fences_mask];
+        do {
+                last_seq++;
+                last_seq &= ring->fence_drv.num_fences_mask;
+
+                ptr = &ring->fence_drv.fences[last_seq];
                 rcu_read_lock();
                 unprocessed = rcu_dereference(*ptr);

@@ -813,7 +849,7 @@ void amdgpu_ring_backup_unprocessed_commands(struct amdgpu_ring *ring,
                         wptr = fence->wptr;
                 }
                 rcu_read_unlock();
-        }
+        } while (last_seq != seq);
 }

 /*
drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c

Lines changed: 1 addition & 1 deletion
@@ -371,7 +371,7 @@ static int amdgpu_debugfs_jpeg_sched_mask_set(void *data, u64 val)
         for (i = 0; i < adev->jpeg.num_jpeg_inst; ++i) {
                 for (j = 0; j < adev->jpeg.num_jpeg_rings; ++j) {
                         ring = &adev->jpeg.inst[i].ring_dec[j];
-                        if (val & (BIT_ULL(1) << ((i * adev->jpeg.num_jpeg_rings) + j)))
+                        if (val & (BIT_ULL((i * adev->jpeg.num_jpeg_rings) + j)))
                                 ring->sched.ready = true;
                         else
                                 ring->sched.ready = false;
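
The change above fixes an off-by-one in the bit test: BIT_ULL(1) << k is 1ULL << (k + 1), i.e. BIT_ULL(k + 1), so ring (i, j) was being toggled by the bit intended for the next ring and bit 0 of the debugfs mask never selected anything. With the fix, writing 0x1 to the sched mask selects instance 0, ring 0 as expected.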

drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c

Lines changed: 4 additions & 3 deletions
@@ -758,7 +758,8 @@ int amdgpu_info_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
                 ui64 = atomic64_read(&adev->num_vram_cpu_page_faults);
                 return copy_to_user(out, &ui64, min(size, 8u)) ? -EFAULT : 0;
         case AMDGPU_INFO_VRAM_USAGE:
-                ui64 = ttm_resource_manager_usage(&adev->mman.vram_mgr.manager);
+                ui64 = ttm_resource_manager_used(&adev->mman.vram_mgr.manager) ?
+                        ttm_resource_manager_usage(&adev->mman.vram_mgr.manager) : 0;
                 return copy_to_user(out, &ui64, min(size, 8u)) ? -EFAULT : 0;
         case AMDGPU_INFO_VIS_VRAM_USAGE:
                 ui64 = amdgpu_vram_mgr_vis_usage(&adev->mman.vram_mgr);
@@ -804,8 +805,8 @@ int amdgpu_info_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
                 mem.vram.usable_heap_size = adev->gmc.real_vram_size -
                         atomic64_read(&adev->vram_pin_size) -
                         AMDGPU_VM_RESERVED_VRAM;
-                mem.vram.heap_usage =
-                        ttm_resource_manager_usage(vram_man);
+                mem.vram.heap_usage = ttm_resource_manager_used(&adev->mman.vram_mgr.manager) ?
+                        ttm_resource_manager_usage(vram_man) : 0;
                 mem.vram.max_allocation = mem.vram.usable_heap_size * 3 / 4;

                 mem.cpu_accessible_vram.total_heap_size =

drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c

Lines changed: 11 additions & 9 deletions
@@ -409,7 +409,7 @@ int amdgpu_mes_detect_and_reset_hung_queues(struct amdgpu_device *adev,
                 return -EINVAL;

         /* Clear the doorbell array before detection */
-        memset(adev->mes.hung_queue_db_array_cpu_addr, 0,
+        memset(adev->mes.hung_queue_db_array_cpu_addr, AMDGPU_MES_INVALID_DB_OFFSET,
                adev->mes.hung_queue_db_array_size * sizeof(u32));
         input.queue_type = queue_type;
         input.detect_only = detect_only;
@@ -420,12 +420,17 @@ int amdgpu_mes_detect_and_reset_hung_queues(struct amdgpu_device *adev,
                 dev_err(adev->dev, "failed to detect and reset\n");
         } else {
                 *hung_db_num = 0;
-                for (i = 0; i < adev->mes.hung_queue_db_array_size; i++) {
+                for (i = 0; i < adev->mes.hung_queue_hqd_info_offset; i++) {
                         if (db_array[i] != AMDGPU_MES_INVALID_DB_OFFSET) {
                                 hung_db_array[i] = db_array[i];
                                 *hung_db_num += 1;
                         }
                 }
+
+                /*
+                 * TODO: return HQD info for MES scheduled user compute queue reset cases
+                 * stored in hung_db_array hqd info offset to full array size
+                 */
         }

         return r;
@@ -686,14 +691,11 @@ int amdgpu_mes_init_microcode(struct amdgpu_device *adev, int pipe)
 bool amdgpu_mes_suspend_resume_all_supported(struct amdgpu_device *adev)
 {
         uint32_t mes_rev = adev->mes.sched_version & AMDGPU_MES_VERSION_MASK;
-        bool is_supported = false;
-
-        if (amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0) &&
-            amdgpu_ip_version(adev, GC_HWIP, 0) < IP_VERSION(12, 0, 0) &&
-            mes_rev >= 0x63)
-                is_supported = true;

-        return is_supported;
+        return ((amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0) &&
+                 amdgpu_ip_version(adev, GC_HWIP, 0) < IP_VERSION(12, 0, 0) &&
+                 mes_rev >= 0x63) ||
+                amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(12, 0, 0));
 }

 /* Fix me -- node_id is used to identify the correct MES instances in the future */
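
Two notes on the hunks above. First, memset() truncates its value argument to a single byte, so initializing the array with AMDGPU_MES_INVALID_DB_OFFSET only works because (assuming the usual 0xffffffff definition, which is not shown in this diff) the resulting 0xff fill still makes every u32 slot read back as the invalid marker that the detection loop compares against. Second, the new loop bound treats only the first hung_queue_hqd_info_offset entries as doorbell offsets, leaving the tail of the array for the HQD info mentioned in the added TODO. A userspace-style sketch of the memset behaviour, under that 0xffffffff assumption:

#include <assert.h>
#include <stdint.h>
#include <string.h>

#define INVALID_DB_OFFSET 0xffffffffu   /* assumed value, not taken from this diff */

int main(void)
{
        uint32_t db[8];

        /* memset() keeps only the low byte of its value argument (0xff here),
         * so every 32-bit slot still reads back as 0xffffffff. */
        memset(db, INVALID_DB_OFFSET, sizeof(db));
        for (unsigned int i = 0; i < 8; i++)
                assert(db[i] == INVALID_DB_OFFSET);
        return 0;
}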

0 commit comments

Comments
 (0)