
Commit 286db24

hugetlb: force allocating surplus hugepages on mempolicy allowed nodes
JIRA: https://issues.redhat.com/browse/RHEL-38605
Tested: by reporter

commit 003af99
Author: Aristeu Rozanski <aris@redhat.com>
Date: Fri Jun 21 15:00:50 2024 -0400

    hugetlb: force allocating surplus hugepages on mempolicy allowed nodes

    When trying to allocate a hugepage with no reserved ones free, the
    allocation may still be allowed if a number of overcommit hugepages was
    configured (using /proc/sys/vm/nr_overcommit_hugepages) and that number
    hasn't been reached yet. This allows extra hugepages to be allocated
    dynamically, if there are resources for it. Some sysadmins even prefer
    not reserving any hugepages and setting a large number of overcommit
    hugepages.

    But when attempting to allocate overcommit hugepages on a multi-node
    system (either NUMA or mempolicy/cpuset), such allocations might
    randomly fail even when there are resources available.

    This happens because allowed_mems_nr() only accounts for the number of
    free hugepages on the nodes the current process belongs to, while the
    surplus hugepage allocation may be satisfied from any node. If one or
    more of the requested surplus hugepages end up allocated on a different
    node, the whole allocation fails because allowed_mems_nr() returns a
    lower value.

    So allocate surplus hugepages on one of the nodes the current process
    belongs to.

    An easy way to reproduce this issue is to use a system with two or more
    NUMA nodes:

        # echo 0 >/proc/sys/vm/nr_hugepages
        # echo 1 >/proc/sys/vm/nr_overcommit_hugepages
        # numactl -m0 ./tools/testing/selftests/mm/map_hugetlb 2

    Repeated executions of the map_hugetlb test application will eventually
    fail when the hugepage ends up allocated on a different node.

    [aris@ruivo.org: v2]
      Link: https://lkml.kernel.org/r/20240701212343.GG844599@cathedrallabs.org
    Link: https://lkml.kernel.org/r/20240621190050.mhxwb65zn37doegp@redhat.com
    Signed-off-by: Aristeu Rozanski <aris@redhat.com>
    Cc: Muchun Song <muchun.song@linux.dev>
    Cc: Aristeu Rozanski <aris@ruivo.org>
    Cc: David Hildenbrand <david@redhat.com>
    Cc: Vishal Moola <vishal.moola@gmail.com>
    Cc: <stable@vger.kernel.org>
    Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

Signed-off-by: Aristeu Rozanski <arozansk@redhat.com>
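As a quick sanity check (not part of the commit itself), the reproducer above can be run in a loop: on an unpatched multi-node system it eventually fails once a surplus hugepage lands on a node outside the mempolicy, while with this patch applied every iteration should pass. The iteration count of 50 is arbitrary:

    # echo 0 >/proc/sys/vm/nr_hugepages
    # echo 1 >/proc/sys/vm/nr_overcommit_hugepages
    # for i in $(seq 50); do numactl -m0 ./tools/testing/selftests/mm/map_hugetlb 2 || break; done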
1 parent f115f77

File tree: 1 file changed (+28, -19 lines)

mm/hugetlb.c

Lines changed: 28 additions & 19 deletions
@@ -2580,6 +2580,23 @@ struct folio *alloc_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *v
         return folio;
 }
 
+static nodemask_t *policy_mbind_nodemask(gfp_t gfp)
+{
+#ifdef CONFIG_NUMA
+        struct mempolicy *mpol = get_task_policy(current);
+
+        /*
+         * Only enforce MPOL_BIND policy which overlaps with cpuset policy
+         * (from policy_nodemask) specifically for hugetlb case
+         */
+        if (mpol->mode == MPOL_BIND &&
+            (apply_policy_zone(mpol, gfp_zone(gfp)) &&
+             cpuset_nodemask_valid_mems_allowed(&mpol->nodes)))
+                return &mpol->nodes;
+#endif
+        return NULL;
+}
+
 /*
  * Increase the hugetlb pool such that it can accommodate a reservation
  * of size 'delta'.
@@ -2594,6 +2611,8 @@ static int gather_surplus_pages(struct hstate *h, long delta)
         long i;
         long needed, allocated;
         bool alloc_ok = true;
+        int node;
+        nodemask_t *mbind_nodemask = policy_mbind_nodemask(htlb_alloc_mask(h));
 
         lockdep_assert_held(&hugetlb_lock);
         needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
@@ -2608,8 +2627,15 @@ static int gather_surplus_pages(struct hstate *h, long delta)
 retry:
         spin_unlock_irq(&hugetlb_lock);
         for (i = 0; i < needed; i++) {
-                folio = alloc_surplus_hugetlb_folio(h, htlb_alloc_mask(h),
-                                NUMA_NO_NODE, NULL);
+                folio = NULL;
+                for_each_node_mask(node, cpuset_current_mems_allowed) {
+                        if (!mbind_nodemask || node_isset(node, *mbind_nodemask)) {
+                                folio = alloc_surplus_hugetlb_folio(h, htlb_alloc_mask(h),
+                                                node, NULL);
+                                if (folio)
+                                        break;
+                        }
+                }
                 if (!folio) {
                         alloc_ok = false;
                         break;
@@ -4589,23 +4615,6 @@ static int __init default_hugepagesz_setup(char *s)
 }
 __setup("default_hugepagesz=", default_hugepagesz_setup);
 
-static nodemask_t *policy_mbind_nodemask(gfp_t gfp)
-{
-#ifdef CONFIG_NUMA
-        struct mempolicy *mpol = get_task_policy(current);
-
-        /*
-         * Only enforce MPOL_BIND policy which overlaps with cpuset policy
-         * (from policy_nodemask) specifically for hugetlb case
-         */
-        if (mpol->mode == MPOL_BIND &&
-            (apply_policy_zone(mpol, gfp_zone(gfp)) &&
-             cpuset_nodemask_valid_mems_allowed(&mpol->nodes)))
-                return &mpol->nodes;
-#endif
-        return NULL;
-}
-
 static unsigned int allowed_mems_nr(struct hstate *h)
 {
         int node;
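The diff context above cuts off at the top of allowed_mems_nr(). For reference, here is a sketch of that function as it appears in upstream mm/hugetlb.c around this commit (exact details may differ between kernel versions). It counts free hugepages only on nodes allowed by the current cpuset and, if set, the MPOL_BIND nodemask, which is the same filter gather_surplus_pages() now applies when allocating; keeping the two in agreement is what fixes the spurious failures.

/* Sketch of upstream allowed_mems_nr(); unchanged by this patch */
static unsigned int allowed_mems_nr(struct hstate *h)
{
        int node;
        unsigned int nr = 0;
        nodemask_t *mbind_nodemask;
        unsigned int *array = h->free_huge_pages_node;
        gfp_t gfp_mask = htlb_alloc_mask(h);

        /* Same policy filter gather_surplus_pages() now uses for allocation */
        mbind_nodemask = policy_mbind_nodemask(gfp_mask);
        for_each_node_mask(node, cpuset_current_mems_allowed) {
                if (!mbind_nodemask || node_isset(node, *mbind_nodemask))
                        nr += array[node];
        }

        return nr;
}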
