Commit 42421eb

Merge: mm/hugetlb: fixes for split races
MR: https://gitlab.com/redhat/centos-stream/src/kernel/centos-stream-10/-/merge_requests/1138
JIRA: https://issues.redhat.com/browse/RHEL-101261
JIRA: https://issues.redhat.com/browse/RHEL-101296
CVE: CVE-2025-38084
CVE: CVE-2025-38085

Currently, __split_vma() triggers hugetlb page table unsharing through vm_ops->may_split(). This happens before the VMA lock and rmap locks are taken - which is too early: it allows racing VMA-locked page faults in our process and racing rmap walks from other processes to cause page tables to be shared again before we actually perform the split.

Signed-off-by: Rafael Aquini <raquini@redhat.com>
Approved-by: Herton R. Krzesinski <herton@redhat.com>
Approved-by: Waiman Long <longman@redhat.com>
Approved-by: CKI KWF Bot <cki-ci-bot+kwf-gitlab-com@redhat.com>
Merged-by: Julio Faracco <jfaracco@redhat.com>
2 parents cfbce54 + 41f7eb5 commit 42421eb
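
The fix reorders the unsharing so that it happens only after vma_prepare() has taken the VMA write lock and the file rmap lock, rather than earlier via vm_ops->may_split(). A minimal sketch of the resulting ordering in __split_vma(), condensed from the mm/vma.c hunk below (allocation, error handling, and the rest of the split are omitted, so this is illustrative rather than compilable on its own):

static int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
                       unsigned long addr, int new_below)
{
        /* ... allocate new VMA, init_vma_prep(&vp, vma), vp.insert = new ... */

        vma_prepare(&vp);       /* VMA write lock and rmap locks are now held */

        /*
         * Get rid of huge pages and shared page tables straddling the split
         * boundary - only now, with the locks held, so racing VMA-locked page
         * faults and rmap walks cannot re-share the PMDs before the split
         * completes.
         */
        vma_adjust_trans_huge(vma, vma->vm_start, addr, 0);
        if (is_vm_hugetlb_page(vma))
                hugetlb_split(vma, addr);

        /* ... adjust vm_start/vm_end and complete the split ... */
        return 0;
}

hugetlb_split() in turn calls hugetlb_unshare_pmds() with take_locks = false, since the caller already holds the locks that hugetlb_unshare_pmds() would otherwise take itself.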

File tree

5 files changed: +90, -20 lines

include/linux/hugetlb.h

Lines changed: 8 additions & 0 deletions

@@ -272,6 +272,8 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
 bool is_hugetlb_entry_migration(pte_t pte);
 bool is_hugetlb_entry_hwpoisoned(pte_t pte);
 void hugetlb_unshare_all_pmds(struct vm_area_struct *vma);
+void fixup_hugetlb_reservations(struct vm_area_struct *vma);
+void hugetlb_split(struct vm_area_struct *vma, unsigned long addr);
 
 #else /* !CONFIG_HUGETLB_PAGE */
 
@@ -465,6 +467,12 @@ static inline vm_fault_t hugetlb_fault(struct mm_struct *mm,
 
 static inline void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) { }
 
+static inline void fixup_hugetlb_reservations(struct vm_area_struct *vma)
+{
+}
+
+static inline void hugetlb_split(struct vm_area_struct *vma, unsigned long addr) {}
+
 #endif /* !CONFIG_HUGETLB_PAGE */
 
 #ifndef pgd_write

mm/hugetlb.c

Lines changed: 66 additions & 17 deletions

@@ -87,7 +87,7 @@ static void hugetlb_vma_lock_free(struct vm_area_struct *vma);
 static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma);
 static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma);
 static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
-                unsigned long start, unsigned long end);
+                unsigned long start, unsigned long end, bool take_locks);
 static struct resv_map *vma_resv_map(struct vm_area_struct *vma);
 
 static void hugetlb_free_folio(struct folio *folio)
@@ -1218,7 +1218,7 @@ void hugetlb_dup_vma_private(struct vm_area_struct *vma)
 /*
  * Reset and decrement one ref on hugepage private reservation.
  * Called with mm->mmap_lock writer semaphore held.
- * This function should be only used by move_vma() and operate on
+ * This function should be only used by mremap and operate on
  * same sized vma. It should never come here with last ref on the
  * reservation.
  */
@@ -5130,26 +5130,40 @@ static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr)
 {
         if (addr & ~(huge_page_mask(hstate_vma(vma))))
                 return -EINVAL;
+        return 0;
+}
 
+void hugetlb_split(struct vm_area_struct *vma, unsigned long addr)
+{
         /*
          * PMD sharing is only possible for PUD_SIZE-aligned address ranges
          * in HugeTLB VMAs. If we will lose PUD_SIZE alignment due to this
          * split, unshare PMDs in the PUD_SIZE interval surrounding addr now.
+         * This function is called in the middle of a VMA split operation, with
+         * MM, VMA and rmap all write-locked to prevent concurrent page table
+         * walks (except hardware and gup_fast()).
          */
+        vma_assert_write_locked(vma);
+        i_mmap_assert_write_locked(vma->vm_file->f_mapping);
+
         if (addr & ~PUD_MASK) {
-                /*
-                 * hugetlb_vm_op_split is called right before we attempt to
-                 * split the VMA. We will need to unshare PMDs in the old and
-                 * new VMAs, so let's unshare before we split.
-                 */
                 unsigned long floor = addr & PUD_MASK;
                 unsigned long ceil = floor + PUD_SIZE;
 
-                if (floor >= vma->vm_start && ceil <= vma->vm_end)
-                        hugetlb_unshare_pmds(vma, floor, ceil);
+                if (floor >= vma->vm_start && ceil <= vma->vm_end) {
+                        /*
+                         * Locking:
+                         * Use take_locks=false here.
+                         * The file rmap lock is already held.
+                         * The hugetlb VMA lock can't be taken when we already
+                         * hold the file rmap lock, and we don't need it because
+                         * its purpose is to synchronize against concurrent page
+                         * table walks, which are not possible thanks to the
+                         * locks held by our caller.
+                         */
+                        hugetlb_unshare_pmds(vma, floor, ceil, /* take_locks = */ false);
+                }
         }
-
-        return 0;
 }
 
 static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma)
@@ -7309,6 +7323,13 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
                 return 0;
 
         pud_clear(pud);
+        /*
+         * Once our caller drops the rmap lock, some other process might be
+         * using this page table as a normal, non-hugetlb page table.
+         * Wait for pending gup_fast() in other threads to finish before letting
+         * that happen.
+         */
+        tlb_remove_table_sync_one();
         ptdesc_pmd_pts_dec(virt_to_ptdesc(ptep));
         mm_dec_nr_pmds(mm);
         return 1;
@@ -7541,9 +7562,16 @@ void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int re
                 }
         }
 
+/*
+ * If @take_locks is false, the caller must ensure that no concurrent page table
+ * access can happen (except for gup_fast() and hardware page walks).
+ * If @take_locks is true, we take the hugetlb VMA lock (to lock out things like
+ * concurrent page fault handling) and the file rmap lock.
+ */
 static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
                                  unsigned long start,
-                                 unsigned long end)
+                                 unsigned long end,
+                                 bool take_locks)
 {
         struct hstate *h = hstate_vma(vma);
         unsigned long sz = huge_page_size(h);
@@ -7567,8 +7595,12 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
         mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
                                 start, end);
         mmu_notifier_invalidate_range_start(&range);
-        hugetlb_vma_lock_write(vma);
-        i_mmap_lock_write(vma->vm_file->f_mapping);
+        if (take_locks) {
+                hugetlb_vma_lock_write(vma);
+                i_mmap_lock_write(vma->vm_file->f_mapping);
+        } else {
+                i_mmap_assert_write_locked(vma->vm_file->f_mapping);
+        }
         for (address = start; address < end; address += PUD_SIZE) {
                 ptep = hugetlb_walk(vma, address, sz);
                 if (!ptep)
@@ -7578,8 +7610,10 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
                 spin_unlock(ptl);
         }
         flush_hugetlb_tlb_range(vma, start, end);
-        i_mmap_unlock_write(vma->vm_file->f_mapping);
-        hugetlb_vma_unlock_write(vma);
+        if (take_locks) {
+                i_mmap_unlock_write(vma->vm_file->f_mapping);
+                hugetlb_vma_unlock_write(vma);
+        }
         /*
          * No need to call mmu_notifier_arch_invalidate_secondary_tlbs(), see
          * Documentation/mm/mmu_notifier.rst.
@@ -7594,7 +7628,22 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
 void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
 {
         hugetlb_unshare_pmds(vma, ALIGN(vma->vm_start, PUD_SIZE),
                         ALIGN_DOWN(vma->vm_end, PUD_SIZE),
-                        ALIGN_DOWN(vma->vm_end, PUD_SIZE));
+                        /* take_locks = */ true);
+}
+
+/*
+ * For hugetlb, mremap() is an odd edge case - while the VMA copying is
+ * performed, we permit both the old and new VMAs to reference the same
+ * reservation.
+ *
+ * We fix this up after the operation succeeds, or if a newly allocated VMA
+ * is closed as a result of a failure to allocate memory.
+ */
+void fixup_hugetlb_reservations(struct vm_area_struct *vma)
+{
+        if (is_vm_hugetlb_page(vma))
+                clear_vma_resv_huge_pages(vma);
 }
 
 #ifdef CONFIG_CMA

mm/mremap.c

Lines changed: 1 addition & 3 deletions

@@ -782,9 +782,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
                 mremap_userfaultfd_prep(new_vma, uf);
         }
 
-        if (is_vm_hugetlb_page(vma)) {
-                clear_vma_resv_huge_pages(vma);
-        }
+        fixup_hugetlb_reservations(vma);
 
         /* Conceal VM_ACCOUNT so old reservation is not undone */
         if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP)) {

mm/vma.c

Lines changed: 8 additions & 0 deletions

@@ -467,7 +467,14 @@ static int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
         init_vma_prep(&vp, vma);
         vp.insert = new;
         vma_prepare(&vp);
+
+        /*
+         * Get rid of huge pages and shared page tables straddling the split
+         * boundary.
+         */
         vma_adjust_trans_huge(vma, vma->vm_start, addr, 0);
+        if (is_vm_hugetlb_page(vma))
+                hugetlb_split(vma, addr);
 
         if (new_below) {
                 vma->vm_start = addr;
@@ -1776,6 +1783,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
         return new_vma;
 
 out_vma_link:
+        fixup_hugetlb_reservations(new_vma);
         vma_close(new_vma);
 
         if (new_vma->vm_file)

tools/testing/vma/vma_internal.h

Lines changed: 7 additions & 0 deletions

@@ -746,6 +746,8 @@ static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
         (void)adjust_next;
 }
 
+static inline void hugetlb_split(struct vm_area_struct *, unsigned long) {}
+
 static inline void vma_iter_free(struct vma_iterator *vmi)
 {
         mas_destroy(&vmi->mas);
@@ -1033,4 +1035,9 @@ static inline int mmap_file(struct file *, struct vm_area_struct *)
         return 0;
 }
 
+static inline void fixup_hugetlb_reservations(struct vm_area_struct *vma)
+{
+        (void)vma;
+}
+
 #endif /* __MM_VMA_INTERNAL_H */
