Commit 92af98c
mm, bpf: Introduce free_pages_nolock()
JIRA: https://issues.redhat.com/browse/RHEL-78202

Conflicts:
- conflicts due to the missing mm series "Allocate and free frozen
  pages"; resolved with reference to v6 of the patch series

commit 8c57b68
Author: Alexei Starovoitov <ast@kernel.org>
Date:   Fri Feb 21 18:44:24 2025 -0800

    mm, bpf: Introduce free_pages_nolock()

    Introduce free_pages_nolock() that can free pages without taking locks.
    It relies on trylock and can be called from any context. Since
    spin_trylock() cannot be used in PREEMPT_RT from hard IRQ or NMI, it
    uses a lockless link list to stash the pages, which will be freed by a
    subsequent free_pages() from a good context.

    Do not use llist unconditionally. BPF maps continuously allocate and
    free, so we cannot unconditionally delay the freeing to llist. When the
    memory becomes free, make it available to the kernel and BPF users
    right away if possible, and fall back to llist as the last resort.

    Acked-by: Vlastimil Babka <vbabka@suse.cz>
    Acked-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
    Reviewed-by: Shakeel Butt <shakeel.butt@linux.dev>
    Signed-off-by: Alexei Starovoitov <ast@kernel.org>
    Link: https://lore.kernel.org/r/20250222024427.30294-4-alexei.starovoitov@gmail.com
    Signed-off-by: Alexei Starovoitov <ast@kernel.org>

Signed-off-by: Gregory Bell <grbell@redhat.com>
1 parent dd0169f commit 92af98c
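Taken together, the diff below wires a new FPI_TRYLOCK mode through the page freeing paths. For orientation, here is a hedged usage sketch, not part of the patch: it assumes try_alloc_pages() from earlier in this series, and example_scratch_page() is a hypothetical caller.

#include <linux/gfp.h>
#include <linux/mm.h>

/*
 * Hypothetical caller running where regular alloc_pages()/free_pages()
 * are unsafe, e.g. under a raw spinlock or in NMI context.
 */
static void example_scratch_page(int nid)
{
	/* Opportunistic: relies on trylocks internally, may return NULL. */
	struct page *page = try_alloc_pages(nid, 0);

	if (!page)
		return;

	/* ... use page_address(page) as scratch memory ... */

	/*
	 * Freed through the pcp or buddy path when the trylocks succeed;
	 * otherwise the page is stashed on zone->trylock_free_pages and
	 * drained by a later free from a good context.
	 */
	free_pages_nolock(page, 0);
}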

File tree: 6 files changed (+100, -14 lines)

include/linux/gfp.h (1 addition, 0 deletions)

@@ -391,6 +391,7 @@ __meminit void *alloc_pages_exact_nid_noprof(int nid, size_t size, gfp_t gfp_mas
 	__get_free_pages((gfp_mask) | GFP_DMA, (order))
 
 extern void __free_pages(struct page *page, unsigned int order);
+extern void free_pages_nolock(struct page *page, unsigned int order);
 extern void free_pages(unsigned long addr, unsigned int order);
 
 #define __free_page(page) __free_pages((page), 0)

include/linux/mm_types.h (4 additions, 0 deletions)

@@ -99,6 +99,10 @@ struct page {
 				/* Or, free page */
 				struct list_head buddy_list;
 				struct list_head pcp_list;
+				struct {
+					struct llist_node pcp_llist;
+					unsigned int order;
+				};
 			};
 			/* See page-flags.h for PAGE_MAPPING_FLAGS */
 			struct address_space *mapping;

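The four added lines fit without growing struct page: the anonymous struct overlays pcp_list inside the existing union, since a page parked on the deferred llist is on no pcp list, and struct llist_node is a single pointer, leaving room for the saved order within struct list_head's two pointers. A hedged sketch of that layout argument (struct overlay is a stand-in name, not kernel code):

#include <linux/build_bug.h>
#include <linux/list.h>
#include <linux/llist.h>

struct overlay {				/* stand-in for the union members above */
	union {
		struct list_head pcp_list;	/* page sits on a pcp free list */
		struct {			/* or is parked for deferred freeing */
			struct llist_node pcp_llist;
			unsigned int order;	/* recorded for the later drain */
		};
	};
};

/* One llist pointer plus the saved order never outgrow two list pointers. */
static_assert(sizeof(struct llist_node) + sizeof(unsigned int) <=
	      sizeof(struct list_head));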
include/linux/mmzone.h (3 additions, 0 deletions)

@@ -982,6 +982,9 @@ struct zone {
 	/* Primarily protects free_area */
 	spinlock_t lock;
 
+	/* Pages to be freed when next trylock succeeds */
+	struct llist_head trylock_free_pages;
+
 	/* Write-intensive fields used by compaction and vmstats. */
 	CACHELINE_PADDING(_pad2_);

lib/stackdepot.c (4 additions, 1 deletion)

@@ -672,7 +672,10 @@ depot_stack_handle_t stack_depot_save_flags(unsigned long *entries,
 exit:
 	if (prealloc) {
 		/* Stack depot didn't use this memory, free it. */
-		free_pages((unsigned long)prealloc, DEPOT_POOL_ORDER);
+		if (!allow_spin)
+			free_pages_nolock(virt_to_page(prealloc), DEPOT_POOL_ORDER);
+		else
+			free_pages((unsigned long)prealloc, DEPOT_POOL_ORDER);
 	}
 	if (found)
 		handle = found->handle.handle;

mm/page_alloc.c (81 additions, 12 deletions)

@@ -88,6 +88,9 @@ typedef int __bitwise fpi_t;
  */
 #define FPI_TO_TAIL		((__force fpi_t)BIT(1))
 
+/* Free the page without taking locks. Rely on trylock only. */
+#define FPI_TRYLOCK		((__force fpi_t)BIT(2))
+
 /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
 static DEFINE_MUTEX(pcp_batch_high_lock);
 #define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8)
@@ -1245,13 +1248,44 @@ static void split_large_buddy(struct zone *zone, struct page *page,
 	} while (1);
 }
 
+static void add_page_to_zone_llist(struct zone *zone, struct page *page,
+				   unsigned int order)
+{
+	/* Remember the order */
+	page->order = order;
+	/* Add the page to the free list */
+	llist_add(&page->pcp_llist, &zone->trylock_free_pages);
+}
+
 static void free_one_page(struct zone *zone, struct page *page,
 			  unsigned long pfn, unsigned int order,
 			  fpi_t fpi_flags)
 {
+	struct llist_head *llhead;
 	unsigned long flags;
 
-	spin_lock_irqsave(&zone->lock, flags);
+	if (!spin_trylock_irqsave(&zone->lock, flags)) {
+		if (unlikely(fpi_flags & FPI_TRYLOCK)) {
+			add_page_to_zone_llist(zone, page, order);
+			return;
+		}
+		spin_lock_irqsave(&zone->lock, flags);
+	}
+
+	/* The lock succeeded. Process deferred pages. */
+	llhead = &zone->trylock_free_pages;
+	if (unlikely(!llist_empty(llhead) && !(fpi_flags & FPI_TRYLOCK))) {
+		struct llist_node *llnode;
+		struct page *p, *tmp;
+
+		llnode = llist_del_all(llhead);
+		llist_for_each_entry_safe(p, tmp, llnode, pcp_llist) {
+			unsigned int p_order = p->order;
+
+			split_large_buddy(zone, p, page_to_pfn(p), p_order, fpi_flags);
+			__count_vm_events(PGFREE, 1 << p_order);
+		}
+	}
 	split_large_buddy(zone, page, pfn, order, fpi_flags);
 	spin_unlock_irqrestore(&zone->lock, flags);
 
@@ -2594,7 +2628,7 @@ static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone,
 
 static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp,
 				   struct page *page, int migratetype,
-				   unsigned int order)
+				   unsigned int order, fpi_t fpi_flags)
 {
 	int high, batch;
 	int pindex;
@@ -2629,6 +2663,14 @@ static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp,
 	}
 	if (pcp->free_count < (batch << CONFIG_PCP_BATCH_SCALE_MAX))
 		pcp->free_count += (1 << order);
+
+	if (unlikely(fpi_flags & FPI_TRYLOCK)) {
+		/*
+		 * Do not attempt to take a zone lock. Let pcp->count get
+		 * over high mark temporarily.
+		 */
+		return;
+	}
 	high = nr_pcp_high(pcp, zone, batch, free_high);
 	if (pcp->count >= high) {
 		free_pcppages_bulk(zone, nr_pcp_free(pcp, batch, high, free_high),
@@ -2643,7 +2685,8 @@ static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp,
 /*
  * Free a pcp page
  */
-void free_unref_page(struct page *page, unsigned int order)
+static void __free_unref_page(struct page *page, unsigned int order,
+			      fpi_t fpi_flags)
 {
 	unsigned long __maybe_unused UP_flags;
 	struct per_cpu_pages *pcp;
@@ -2652,7 +2695,7 @@ void free_unref_page(struct page *page, unsigned int order)
 	int migratetype;
 
 	if (!pcp_allowed_order(order)) {
-		__free_pages_ok(page, order, FPI_NONE);
+		__free_pages_ok(page, order, fpi_flags);
 		return;
 	}
 
@@ -2669,24 +2712,34 @@ void free_unref_page(struct page *page, unsigned int order)
 	migratetype = get_pfnblock_migratetype(page, pfn);
 	if (unlikely(migratetype >= MIGRATE_PCPTYPES)) {
 		if (unlikely(is_migrate_isolate(migratetype))) {
-			free_one_page(page_zone(page), page, pfn, order, FPI_NONE);
+			free_one_page(page_zone(page), page, pfn, order, fpi_flags);
 			return;
 		}
 		migratetype = MIGRATE_MOVABLE;
 	}
 
 	zone = page_zone(page);
+	if (unlikely((fpi_flags & FPI_TRYLOCK) && IS_ENABLED(CONFIG_PREEMPT_RT)
+		     && (in_nmi() || in_hardirq()))) {
+		add_page_to_zone_llist(zone, page, order);
+		return;
+	}
 	pcp_trylock_prepare(UP_flags);
 	pcp = pcp_spin_trylock(zone->per_cpu_pageset);
 	if (pcp) {
-		free_unref_page_commit(zone, pcp, page, migratetype, order);
+		free_unref_page_commit(zone, pcp, page, migratetype, order, fpi_flags);
 		pcp_spin_unlock(pcp);
 	} else {
-		free_one_page(zone, page, pfn, order, FPI_NONE);
+		free_one_page(zone, page, pfn, order, fpi_flags);
 	}
 	pcp_trylock_finish(UP_flags);
 }
 
+void free_unref_page(struct page *page, unsigned int order)
+{
+	__free_unref_page(page, order, FPI_NONE);
+}
+
 /*
  * Free a batch of folios
  */
@@ -2775,7 +2828,7 @@ void free_unref_folios(struct folio_batch *folios)
 
 		trace_mm_page_free_batched(&folio->page);
 		free_unref_page_commit(zone, pcp, &folio->page, migratetype,
-				       order);
+				       order, FPI_NONE);
 	}
 
 	if (pcp) {
@@ -4820,9 +4873,10 @@ unsigned long get_zeroed_page_noprof(gfp_t gfp_mask)
 EXPORT_SYMBOL(get_zeroed_page_noprof);
 
 /**
- * __free_pages - Free pages allocated with alloc_pages().
+ * ___free_pages - Free pages allocated with alloc_pages().
  * @page: The page pointer returned from alloc_pages().
  * @order: The order of the allocation.
+ * @fpi_flags: Free Page Internal flags.
  *
  * This function can free multi-page allocations that are not compound
 * pages. It does not check that the @order passed in matches that of
@@ -4839,22 +4893,37 @@ EXPORT_SYMBOL(get_zeroed_page_noprof);
 * Context: May be called in interrupt context or while holding a normal
 * spinlock, but not in NMI context or while holding a raw spinlock.
 */
-void __free_pages(struct page *page, unsigned int order)
+static void ___free_pages(struct page *page, unsigned int order,
+			  fpi_t fpi_flags)
 {
 	/* get PageHead before we drop reference */
 	int head = PageHead(page);
 	struct alloc_tag *tag = pgalloc_tag_get(page);
 
 	if (put_page_testzero(page))
-		free_unref_page(page, order);
+		__free_unref_page(page, order, fpi_flags);
 	else if (!head) {
 		pgalloc_tag_sub_pages(tag, (1 << order) - 1);
 		while (order-- > 0)
-			free_unref_page(page + (1 << order), order);
+			__free_unref_page(page + (1 << order), order,
+					  fpi_flags);
 	}
 }
+void __free_pages(struct page *page, unsigned int order)
+{
+	___free_pages(page, order, FPI_NONE);
+}
 EXPORT_SYMBOL(__free_pages);
 
+/*
+ * Can be called while holding raw_spin_lock or from IRQ and NMI for any
+ * page type (not only those that came from try_alloc_pages)
+ */
+void free_pages_nolock(struct page *page, unsigned int order)
+{
+	___free_pages(page, order, FPI_TRYLOCK);
+}
+
 void free_pages(unsigned long addr, unsigned int order)
 {
 	if (addr != 0) {

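The core of the mm/page_alloc.c change is the trylock-or-defer pattern in free_one_page() above: llist_add() is usable from any context, including NMI, and llist_del_all() detaches the whole pending list in one atomic exchange, so drainers never race stashers. A hedged, minimal distillation of that pattern follows, using the pcp_llist/order fields this patch adds to struct page; pending, example_lock, free_or_defer(), and release_to_buddy() are illustrative stand-ins, not kernel symbols.

#include <linux/llist.h>
#include <linux/mm_types.h>
#include <linux/spinlock.h>

static LLIST_HEAD(pending);		/* stand-in for zone->trylock_free_pages */
static DEFINE_SPINLOCK(example_lock);	/* stand-in for zone->lock */

/* Hypothetical stub standing in for the real buddy-freeing work. */
static void release_to_buddy(struct page *p, unsigned int order)
{
}

static void free_or_defer(struct page *page, unsigned int order,
			  bool trylock_only)
{
	unsigned long flags;

	if (!spin_trylock_irqsave(&example_lock, flags)) {
		if (trylock_only) {
			/* Restricted caller: park the page and bail out. */
			page->order = order;
			llist_add(&page->pcp_llist, &pending);
			return;
		}
		/* A normal caller is allowed to spin for the lock. */
		spin_lock_irqsave(&example_lock, flags);
	}

	/*
	 * Lock held: drain pages parked by earlier trylock failures.
	 * Restricted callers skip the drain so their critical section
	 * stays short; that work is left to normal-context frees.
	 */
	if (!trylock_only && !llist_empty(&pending)) {
		struct llist_node *list = llist_del_all(&pending);
		struct page *p, *tmp;

		llist_for_each_entry_safe(p, tmp, list, pcp_llist)
			release_to_buddy(p, p->order);
	}

	release_to_buddy(page, order);
	spin_unlock_irqrestore(&example_lock, flags);
}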
mm/page_owner.c (7 additions, 1 deletion)

@@ -299,7 +299,13 @@ void __reset_page_owner(struct page *page, unsigned short order)
 	alloc_handle = page_owner->handle;
 	page_ext_put(page_ext);
 
-	handle = save_stack(GFP_NOWAIT | __GFP_NOWARN);
+	/*
+	 * Do not specify GFP_NOWAIT to make gfpflags_allow_spinning() == false
+	 * to prevent issues in stack_depot_save().
+	 * This is similar to try_alloc_pages() gfp flags, but only used
+	 * to signal stack_depot to avoid spin_locks.
+	 */
+	handle = save_stack(__GFP_NOWARN);
 	__update_page_owner_free_handle(page, handle, order, current->pid,
 					current->tgid, free_ts_nsec);

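Why dropping GFP_NOWAIT changes behavior here: stack_depot_save_flags() derives allow_spin from gfpflags_allow_spinning(), a helper added earlier in this series that keys off the reclaim bits in the gfp mask, and GFP_NOWAIT carries __GFP_KSWAPD_RECLAIM while a bare __GFP_NOWARN carries no reclaim bits. A hedged sketch of the resulting split (may_spin() is a stand-in name, and the helper's exact semantics are assumed from the series):

#include <linux/gfp.h>

/*
 * Stand-in wrapper: gfpflags_allow_spinning() reports whether the
 * caller may spin on locks at all, a stronger restriction than
 * GFP_NOWAIT/GFP_ATOMIC, which only forbid sleeping.
 */
static bool may_spin(gfp_t gfp)
{
	return gfpflags_allow_spinning(gfp);
}

/*
 * may_spin(GFP_NOWAIT | __GFP_NOWARN) -> true: __GFP_KSWAPD_RECLAIM is
 *	set, so stack depot may take its pool spinlock and free unused
 *	preallocations with free_pages().
 * may_spin(__GFP_NOWARN) -> false: no reclaim bits, so stack depot
 *	sticks to trylocks and frees via free_pages_nolock(), matching
 *	the lib/stackdepot.c hunk above.
 */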