@@ -87,7 +87,7 @@ static void hugetlb_vma_lock_free(struct vm_area_struct *vma);
 static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma);
 static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma);
 static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
-		unsigned long start, unsigned long end);
+		unsigned long start, unsigned long end, bool take_locks);
 static struct resv_map *vma_resv_map(struct vm_area_struct *vma);
 
 static void hugetlb_free_folio(struct folio *folio)
@@ -1218,7 +1218,7 @@ void hugetlb_dup_vma_private(struct vm_area_struct *vma)
 /*
  * Reset and decrement one ref on hugepage private reservation.
  * Called with mm->mmap_lock writer semaphore held.
- * This function should be only used by move_vma() and operate on
+ * This function should be only used by mremap and operate on
  * same sized vma. It should never come here with last ref on the
  * reservation.
  */
@@ -5130,26 +5130,40 @@ static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr)
 {
 	if (addr & ~(huge_page_mask(hstate_vma(vma))))
 		return -EINVAL;
+	return 0;
+}
 
+void hugetlb_split(struct vm_area_struct *vma, unsigned long addr)
+{
 	/*
 	 * PMD sharing is only possible for PUD_SIZE-aligned address ranges
 	 * in HugeTLB VMAs. If we will lose PUD_SIZE alignment due to this
 	 * split, unshare PMDs in the PUD_SIZE interval surrounding addr now.
+	 * This function is called in the middle of a VMA split operation, with
+	 * MM, VMA and rmap all write-locked to prevent concurrent page table
+	 * walks (except hardware and gup_fast()).
 	 */
+	vma_assert_write_locked(vma);
+	i_mmap_assert_write_locked(vma->vm_file->f_mapping);
+
 	if (addr & ~PUD_MASK) {
-		/*
-		 * hugetlb_vm_op_split is called right before we attempt to
-		 * split the VMA. We will need to unshare PMDs in the old and
-		 * new VMAs, so let's unshare before we split.
-		 */
 		unsigned long floor = addr & PUD_MASK;
 		unsigned long ceil = floor + PUD_SIZE;
 
-		if (floor >= vma->vm_start && ceil <= vma->vm_end)
-			hugetlb_unshare_pmds(vma, floor, ceil);
+		if (floor >= vma->vm_start && ceil <= vma->vm_end) {
+			/*
+			 * Locking:
+			 * Use take_locks=false here.
+			 * The file rmap lock is already held.
+			 * The hugetlb VMA lock can't be taken when we already
+			 * hold the file rmap lock, and we don't need it because
+			 * its purpose is to synchronize against concurrent page
+			 * table walks, which are not possible thanks to the
+			 * locks held by our caller.
+			 */
+			hugetlb_unshare_pmds(vma, floor, ceil, /* take_locks = */ false);
+		}
 	}
-
-	return 0;
 }
 
 static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma)
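For context, a minimal caller-side sketch of the locking contract that hugetlb_split() asserts in the hunk above. The function name example_prepare_split() and the surrounding code are illustrative assumptions, not the real split-path caller (which is outside this hunk), and the sketch assumes a matching hugetlb_split() declaration is added to <linux/hugetlb.h> elsewhere in the series.

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>

/*
 * Hypothetical caller-side sketch: the split path is expected to hold the
 * mmap lock in write mode, write-lock the VMA and hold the file rmap lock
 * before the VMA boundaries move, which is exactly what the asserts in
 * hugetlb_split() re-check.
 */
static void example_prepare_split(struct vm_area_struct *vma, unsigned long addr)
{
	/* Locking provided by the caller, verified again in hugetlb_split(). */
	vma_assert_write_locked(vma);
	i_mmap_assert_write_locked(vma->vm_file->f_mapping);

	/* Unshare PMDs around addr before the VMA is actually split. */
	hugetlb_split(vma, addr);
}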
@@ -7309,6 +7323,13 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
 		return 0;
 
 	pud_clear(pud);
+	/*
+	 * Once our caller drops the rmap lock, some other process might be
+	 * using this page table as a normal, non-hugetlb page table.
+	 * Wait for pending gup_fast() in other threads to finish before letting
+	 * that happen.
+	 */
+	tlb_remove_table_sync_one();
 	ptdesc_pmd_pts_dec(virt_to_ptdesc(ptep));
 	mm_dec_nr_pmds(mm);
 	return 1;
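A short illustrative sketch of the ordering that the new tlb_remove_table_sync_one() call enforces. example_detach_shared_pmd() is a hypothetical helper invented for illustration; only pud_clear(), tlb_remove_table_sync_one() and mm_dec_nr_pmds() are taken from the hunk above.

#include <linux/mm.h>
#include <linux/pgtable.h>
#include <asm/tlb.h>

/*
 * Illustrative only: gup_fast() walks page tables with interrupts disabled,
 * so the IPI broadcast behind tlb_remove_table_sync_one() cannot return
 * until every in-flight lockless walker has stopped looking at the PMD
 * table that was just detached by pud_clear().
 */
static void example_detach_shared_pmd(struct mm_struct *mm, pud_t *pud)
{
	pud_clear(pud);			/* 1. unhook the shared PMD table */
	tlb_remove_table_sync_one();	/* 2. wait out concurrent gup_fast() */
	mm_dec_nr_pmds(mm);		/* 3. only then adjust PMD accounting */
}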
@@ -7541,9 +7562,16 @@ void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int re
 	}
 }
 
+/*
+ * If @take_locks is false, the caller must ensure that no concurrent page table
+ * access can happen (except for gup_fast() and hardware page walks).
+ * If @take_locks is true, we take the hugetlb VMA lock (to lock out things like
+ * concurrent page fault handling) and the file rmap lock.
+ */
 static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
 				unsigned long start,
-				unsigned long end)
+				unsigned long end,
+				bool take_locks)
 {
 	struct hstate *h = hstate_vma(vma);
 	unsigned long sz = huge_page_size(h);
@@ -7567,8 +7595,12 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
 	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
 				start, end);
 	mmu_notifier_invalidate_range_start(&range);
-	hugetlb_vma_lock_write(vma);
-	i_mmap_lock_write(vma->vm_file->f_mapping);
+	if (take_locks) {
+		hugetlb_vma_lock_write(vma);
+		i_mmap_lock_write(vma->vm_file->f_mapping);
+	} else {
+		i_mmap_assert_write_locked(vma->vm_file->f_mapping);
+	}
 	for (address = start; address < end; address += PUD_SIZE) {
 		ptep = hugetlb_walk(vma, address, sz);
 		if (!ptep)
@@ -7578,8 +7610,10 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
 		spin_unlock(ptl);
 	}
 	flush_hugetlb_tlb_range(vma, start, end);
-	i_mmap_unlock_write(vma->vm_file->f_mapping);
-	hugetlb_vma_unlock_write(vma);
+	if (take_locks) {
+		i_mmap_unlock_write(vma->vm_file->f_mapping);
+		hugetlb_vma_unlock_write(vma);
+	}
 	/*
 	 * No need to call mmu_notifier_arch_invalidate_secondary_tlbs(), see
 	 * Documentation/mm/mmu_notifier.rst.
@@ -7594,7 +7628,22 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
 void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
 {
 	hugetlb_unshare_pmds(vma, ALIGN(vma->vm_start, PUD_SIZE),
-			ALIGN_DOWN(vma->vm_end, PUD_SIZE));
+			ALIGN_DOWN(vma->vm_end, PUD_SIZE),
+			/* take_locks = */ true);
+}
+
+/*
+ * For hugetlb, mremap() is an odd edge case - while the VMA copying is
+ * performed, we permit both the old and new VMAs to reference the same
+ * reservation.
+ *
+ * We fix this up after the operation succeeds, or if a newly allocated VMA
+ * is closed as a result of a failure to allocate memory.
+ */
+void fixup_hugetlb_reservations(struct vm_area_struct *vma)
+{
+	if (is_vm_hugetlb_page(vma))
+		clear_vma_resv_huge_pages(vma);
 }
 
 #ifdef CONFIG_CMA
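For orientation, a hedged sketch of the caller-side pattern described in the comment above fixup_hugetlb_reservations(). example_finish_vma_copy() is a made-up name; the real mremap()/copy_vma() call sites are not part of this hunk, and the sketch assumes the helper is declared in <linux/hugetlb.h> elsewhere in the series.

#include <linux/mm.h>
#include <linux/hugetlb.h>

/*
 * Hypothetical sketch: once the VMA copy has either succeeded or the new
 * VMA is being torn down after an allocation failure, drop the duplicated
 * reservation reference. For non-hugetlb VMAs this is a no-op.
 */
static void example_finish_vma_copy(struct vm_area_struct *new_vma)
{
	fixup_hugetlb_reservations(new_vma);
}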