
Commit c3543ba

npiggin authored and gregkh committed
powerpc/64s/hash: Make hash faults work in NMI context
[ Upstream commit 8b91cee ]

Hash faults are not resolved in NMI context, instead causing the access to fail. This is done because perf interrupts can get backtraces, including walking the user stack, and taking a hash fault on those could deadlock on the HPTE lock if the perf interrupt hits while the same HPTE lock is being held by the hash fault code. The user access for the stack walking will notice the access failed and deal with that in the perf code.

The reason to allow perf interrupts in is to better profile hash faults.

The problem with this is that any hash fault on a kernel access that happens in NMI context will crash, because kernel accesses must not fail. Hard lockups, system reset, and machine checks that access vmalloc space (including modules, stack backtracing and symbol lookup in modules, per-cpu data, etc.) could all run into this problem.

Fix this by disallowing perf interrupts in the hash fault code (the direct hash fault is covered by MSR[EE]=0, so the PMI disable just needs to extend to the preload case). This simplifies the tricky logic in hash faults and perf, at the cost of reduced profiling of hash faults.

perf can still latch addresses when interrupts are disabled; it just won't get the stack trace at that point, so it would still find hot spots, just sometimes with confusing stack chains.

An alternative could be to allow perf interrupts here but always do the slowpath stack walk when in NMI context; however, that slows down all perf interrupt stack walking on hash and does not remove as much tricky code.

Reported-by: Laurent Dufour <ldufour@linux.ibm.com>
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Tested-by: Laurent Dufour <ldufour@linux.ibm.com>
Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20220204035348.545435-1-npiggin@gmail.com
Signed-off-by: Sasha Levin <sashal@kernel.org>
1 parent df46792 commit c3543ba
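The substance of the fix is to widen the existing interrupt-disabled window in hash_preload() so that performance monitor interrupts (PMIs) are masked as well, by switching from local_irq_save()/local_irq_restore() to powerpc_local_irq_pmu_save()/powerpc_local_irq_pmu_restore(). A minimal sketch of the resulting pattern (the wrapper name below is invented for illustration, and the real mm_is_thread_local() test and __hash_page_*() call are elided; see the hash_utils.c hunk further down):

static void hash_preload_critical_section_sketch(void)
{
	unsigned long flags;

	/*
	 * Mask external interrupts and PMIs: nothing that could itself take a
	 * hash fault (such as a perf interrupt walking the user stack) may run
	 * while H_PAGE_BUSY is held, otherwise both sides contend on the same
	 * bit and can deadlock.
	 */
	powerpc_local_irq_pmu_save(flags);

	/* ... mm_is_thread_local() test and __hash_page_*() call go here ... */

	powerpc_local_irq_pmu_restore(flags);
}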

4 files changed, +10 -82 lines changed

arch/powerpc/include/asm/interrupt.h
Lines changed: 1 addition & 1 deletion

@@ -567,7 +567,7 @@ DECLARE_INTERRUPT_HANDLER_RAW(do_slb_fault);
 DECLARE_INTERRUPT_HANDLER(do_bad_slb_fault);
 
 /* hash_utils.c */
-DECLARE_INTERRUPT_HANDLER_RAW(do_hash_fault);
+DECLARE_INTERRUPT_HANDLER(do_hash_fault);
 
 /* fault.c */
 DECLARE_INTERRUPT_HANDLER(do_page_fault);

arch/powerpc/mm/book3s64/hash_utils.c
Lines changed: 8 additions & 46 deletions

@@ -1522,8 +1522,7 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap,
 }
 EXPORT_SYMBOL_GPL(hash_page);
 
-DECLARE_INTERRUPT_HANDLER(__do_hash_fault);
-DEFINE_INTERRUPT_HANDLER(__do_hash_fault)
+DEFINE_INTERRUPT_HANDLER(do_hash_fault)
 {
 	unsigned long ea = regs->dar;
 	unsigned long dsisr = regs->dsisr;
@@ -1582,35 +1581,6 @@ DEFINE_INTERRUPT_HANDLER(__do_hash_fault)
 	}
 }
 
-/*
- * The _RAW interrupt entry checks for the in_nmi() case before
- * running the full handler.
- */
-DEFINE_INTERRUPT_HANDLER_RAW(do_hash_fault)
-{
-	/*
-	 * If we are in an "NMI" (e.g., an interrupt when soft-disabled), then
-	 * don't call hash_page, just fail the fault. This is required to
-	 * prevent re-entrancy problems in the hash code, namely perf
-	 * interrupts hitting while something holds H_PAGE_BUSY, and taking a
-	 * hash fault. See the comment in hash_preload().
-	 *
-	 * We come here as a result of a DSI at a point where we don't want
-	 * to call hash_page, such as when we are accessing memory (possibly
-	 * user memory) inside a PMU interrupt that occurred while interrupts
-	 * were soft-disabled. We want to invoke the exception handler for
-	 * the access, or panic if there isn't a handler.
-	 */
-	if (unlikely(in_nmi())) {
-		do_bad_page_fault_segv(regs);
-		return 0;
-	}
-
-	__do_hash_fault(regs);
-
-	return 0;
-}
-
 #ifdef CONFIG_PPC_MM_SLICES
 static bool should_hash_preload(struct mm_struct *mm, unsigned long ea)
 {
@@ -1677,26 +1647,18 @@ static void hash_preload(struct mm_struct *mm, pte_t *ptep, unsigned long ea,
 #endif /* CONFIG_PPC_64K_PAGES */
 
 	/*
-	 * __hash_page_* must run with interrupts off, as it sets the
-	 * H_PAGE_BUSY bit. It's possible for perf interrupts to hit at any
-	 * time and may take a hash fault reading the user stack, see
-	 * read_user_stack_slow() in the powerpc/perf code.
-	 *
-	 * If that takes a hash fault on the same page as we lock here, it
-	 * will bail out when seeing H_PAGE_BUSY set, and retry the access
-	 * leading to an infinite loop.
+	 * __hash_page_* must run with interrupts off, including PMI interrupts
+	 * off, as it sets the H_PAGE_BUSY bit.
 	 *
-	 * Disabling interrupts here does not prevent perf interrupts, but it
-	 * will prevent them taking hash faults (see the NMI test in
-	 * do_hash_page), then read_user_stack's copy_from_user_nofault will
-	 * fail and perf will fall back to read_user_stack_slow(), which
-	 * walks the Linux page tables.
+	 * It's otherwise possible for perf interrupts to hit at any time and
+	 * may take a hash fault reading the user stack, which could take a
+	 * hash miss and deadlock on the same H_PAGE_BUSY bit.
 	 *
 	 * Interrupts must also be off for the duration of the
 	 * mm_is_thread_local test and update, to prevent preempt running the
 	 * mm on another CPU (XXX: this may be racy vs kthread_use_mm).
 	 */
-	local_irq_save(flags);
+	powerpc_local_irq_pmu_save(flags);
 
 	/* Is that local to this CPU ? */
 	if (mm_is_thread_local(mm))
@@ -1721,7 +1683,7 @@ static void hash_preload(struct mm_struct *mm, pte_t *ptep, unsigned long ea,
 				   mm_ctx_user_psize(&mm->context),
 				   pte_val(*ptep));
 
-	local_irq_restore(flags);
+	powerpc_local_irq_pmu_restore(flags);
 }
 
 /*

arch/powerpc/perf/callchain.h
Lines changed: 1 addition & 8 deletions

@@ -2,7 +2,6 @@
 #ifndef _POWERPC_PERF_CALLCHAIN_H
 #define _POWERPC_PERF_CALLCHAIN_H
 
-int read_user_stack_slow(const void __user *ptr, void *buf, int nb);
 void perf_callchain_user_64(struct perf_callchain_entry_ctx *entry,
 			    struct pt_regs *regs);
 void perf_callchain_user_32(struct perf_callchain_entry_ctx *entry,
@@ -26,17 +25,11 @@ static inline int __read_user_stack(const void __user *ptr, void *ret,
 				    size_t size)
 {
 	unsigned long addr = (unsigned long)ptr;
-	int rc;
 
 	if (addr > TASK_SIZE - size || (addr & (size - 1)))
 		return -EFAULT;
 
-	rc = copy_from_user_nofault(ret, ptr, size);
-
-	if (IS_ENABLED(CONFIG_PPC64) && !radix_enabled() && rc)
-		return read_user_stack_slow(ptr, ret, size);
-
-	return rc;
+	return copy_from_user_nofault(ret, ptr, size);
 }
 
 #endif /* _POWERPC_PERF_CALLCHAIN_H */
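Reassembled from the hunk above for readability, the post-patch helper is simply a bounds/alignment check followed by a non-faulting copy; a fault fails the read instead of falling back to a page-table walk:

static inline int __read_user_stack(const void __user *ptr, void *ret,
				    size_t size)
{
	unsigned long addr = (unsigned long)ptr;

	/* Reject out-of-range or misaligned user addresses up front. */
	if (addr > TASK_SIZE - size || (addr & (size - 1)))
		return -EFAULT;

	/* If this faults (e.g. under a PMI with hash), the read just fails. */
	return copy_from_user_nofault(ret, ptr, size);
}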

arch/powerpc/perf/callchain_64.c
Lines changed: 0 additions & 27 deletions

@@ -18,33 +18,6 @@
 
 #include "callchain.h"
 
-/*
- * On 64-bit we don't want to invoke hash_page on user addresses from
- * interrupt context, so if the access faults, we read the page tables
- * to find which page (if any) is mapped and access it directly. Radix
- * has no need for this so it doesn't use read_user_stack_slow.
- */
-int read_user_stack_slow(const void __user *ptr, void *buf, int nb)
-{
-
-	unsigned long addr = (unsigned long) ptr;
-	unsigned long offset;
-	struct page *page;
-	void *kaddr;
-
-	if (get_user_page_fast_only(addr, FOLL_WRITE, &page)) {
-		kaddr = page_address(page);
-
-		/* align address to page boundary */
-		offset = addr & ~PAGE_MASK;
-
-		memcpy(buf, kaddr + offset, nb);
-		put_page(page);
-		return 0;
-	}
-	return -EFAULT;
-}
-
 static int read_user_stack_64(const unsigned long __user *ptr, unsigned long *ret)
 {
 	return __read_user_stack(ptr, ret, sizeof(*ret));