5656#include <linux/khugepaged.h>
5757#include <linux/rculist_nulls.h>
5858#include <linux/random.h>
59+ #include <linux/mmu_notifier.h>
5960
6061#include <asm/tlbflush.h>
6162#include <asm/div64.h>
@@ -3294,7 +3295,8 @@ static bool get_next_vma(unsigned long mask, unsigned long size, struct mm_walk
32943295 return false;
32953296}
32963297
3297- static unsigned long get_pte_pfn (pte_t pte , struct vm_area_struct * vma , unsigned long addr )
3298+ static unsigned long get_pte_pfn (pte_t pte , struct vm_area_struct * vma , unsigned long addr ,
3299+ struct pglist_data * pgdat )
32983300{
32993301 unsigned long pfn = pte_pfn (pte );
33003302
@@ -3306,13 +3308,20 @@ static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned
33063308 if (WARN_ON_ONCE (pte_devmap (pte ) || pte_special (pte )))
33073309 return -1 ;
33083310
3311+ if (!pte_young (pte ) && !mm_has_notifiers (vma -> vm_mm ))
3312+ return -1 ;
3313+
33093314 if (WARN_ON_ONCE (!pfn_valid (pfn )))
33103315 return -1 ;
33113316
3317+ if (pfn < pgdat -> node_start_pfn || pfn >= pgdat_end_pfn (pgdat ))
3318+ return -1 ;
3319+
33123320 return pfn ;
33133321}
33143322
3315- static unsigned long get_pmd_pfn (pmd_t pmd , struct vm_area_struct * vma , unsigned long addr )
3323+ static unsigned long get_pmd_pfn (pmd_t pmd , struct vm_area_struct * vma , unsigned long addr ,
3324+ struct pglist_data * pgdat )
33163325{
33173326 unsigned long pfn = pmd_pfn (pmd );
33183327
@@ -3324,9 +3333,15 @@ static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned
33243333 if (WARN_ON_ONCE (pmd_devmap (pmd )))
33253334 return -1 ;
33263335
3336+ if (!pmd_young (pmd ) && !mm_has_notifiers (vma -> vm_mm ))
3337+ return -1 ;
3338+
33273339 if (WARN_ON_ONCE (!pfn_valid (pfn )))
33283340 return -1 ;
33293341
3342+ if (pfn < pgdat -> node_start_pfn || pfn >= pgdat_end_pfn (pgdat ))
3343+ return -1 ;
3344+
33303345 return pfn ;
33313346}
33323347
@@ -3335,10 +3350,6 @@ static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg,
33353350{
33363351 struct folio * folio ;
33373352
3338- /* try to avoid unnecessary memory loads */
3339- if (pfn < pgdat -> node_start_pfn || pfn >= pgdat_end_pfn (pgdat ))
3340- return NULL ;
3341-
33423353 folio = pfn_folio (pfn );
33433354 if (folio_nid (folio ) != pgdat -> node_id )
33443355 return NULL ;
@@ -3394,20 +3405,16 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
33943405 total ++ ;
33953406 walk -> mm_stats [MM_LEAF_TOTAL ]++ ;
33963407
3397- pfn = get_pte_pfn (ptent , args -> vma , addr );
3408+ pfn = get_pte_pfn (ptent , args -> vma , addr , pgdat );
33983409 if (pfn == -1 )
33993410 continue ;
34003411
3401- if (!pte_young (ptent )) {
3402- continue ;
3403- }
3404-
34053412 folio = get_pfn_folio (pfn , memcg , pgdat , walk -> can_swap );
34063413 if (!folio )
34073414 continue ;
34083415
3409- if (!ptep_test_and_clear_young (args -> vma , addr , pte + i ))
3410- VM_WARN_ON_ONCE (true) ;
3416+ if (!ptep_clear_young_notify (args -> vma , addr , pte + i ))
3417+ continue ;
34113418
34123419 young ++ ;
34133420 walk -> mm_stats [MM_LEAF_YOUNG ]++ ;
@@ -3473,21 +3480,25 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area
34733480 /* don't round down the first address */
34743481 addr = i ? (* first & PMD_MASK ) + i * PMD_SIZE : * first ;
34753482
3476- pfn = get_pmd_pfn (pmd [i ], vma , addr );
3477- if (pfn == -1 )
3483+ if (!pmd_present (pmd [i ]))
34783484 goto next ;
34793485
34803486 if (!pmd_trans_huge (pmd [i ])) {
3481- if (!walk -> force_scan && should_clear_pmd_young ())
3487+ if (!walk -> force_scan && should_clear_pmd_young () &&
3488+ !mm_has_notifiers (args -> mm ))
34823489 pmdp_test_and_clear_young (vma , addr , pmd + i );
34833490 goto next ;
34843491 }
34853492
3493+ pfn = get_pmd_pfn (pmd [i ], vma , addr , pgdat );
3494+ if (pfn == -1 )
3495+ goto next ;
3496+
34863497 folio = get_pfn_folio (pfn , memcg , pgdat , walk -> can_swap );
34873498 if (!folio )
34883499 goto next ;
34893500
3490- if (!pmdp_test_and_clear_young (vma , addr , pmd + i ))
3501+ if (!pmdp_clear_young_notify (vma , addr , pmd + i ))
34913502 goto next ;
34923503
34933504 walk -> mm_stats [MM_LEAF_YOUNG ]++ ;
@@ -3545,24 +3556,18 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end,
35453556 }
35463557
35473558 if (pmd_trans_huge (val )) {
3548- unsigned long pfn = pmd_pfn (val );
35493559 struct pglist_data * pgdat = lruvec_pgdat (walk -> lruvec );
3560+ unsigned long pfn = get_pmd_pfn (val , vma , addr , pgdat );
35503561
35513562 walk -> mm_stats [MM_LEAF_TOTAL ]++ ;
35523563
3553- if (!pmd_young (val )) {
3554- continue ;
3555- }
3556-
3557- /* try to avoid unnecessary memory loads */
3558- if (pfn < pgdat -> node_start_pfn || pfn >= pgdat_end_pfn (pgdat ))
3559- continue ;
3560-
3561- walk_pmd_range_locked (pud , addr , vma , args , bitmap , & first );
3564+ if (pfn != -1 )
3565+ walk_pmd_range_locked (pud , addr , vma , args , bitmap , & first );
35623566 continue ;
35633567 }
35643568
3565- if (!walk -> force_scan && should_clear_pmd_young ()) {
3569+ if (!walk -> force_scan && should_clear_pmd_young () &&
3570+ !mm_has_notifiers (args -> mm )) {
35663571 if (!pmd_young (val ))
35673572 continue ;
35683573
@@ -4036,13 +4041,13 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
40364041 * the PTE table to the Bloom filter. This forms a feedback loop between the
40374042 * eviction and the aging.
40384043 */
4039- void lru_gen_look_around (struct page_vma_mapped_walk * pvmw )
4044+ bool lru_gen_look_around (struct page_vma_mapped_walk * pvmw )
40404045{
40414046 int i ;
40424047 unsigned long start ;
40434048 unsigned long end ;
40444049 struct lru_gen_mm_walk * walk ;
4045- int young = 0 ;
4050+ int young = 1 ;
40464051 pte_t * pte = pvmw -> pte ;
40474052 unsigned long addr = pvmw -> address ;
40484053 struct vm_area_struct * vma = pvmw -> vma ;
@@ -4058,19 +4063,25 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
40584063 lockdep_assert_held (pvmw -> ptl );
40594064 VM_WARN_ON_ONCE_FOLIO (folio_test_lru (folio ), folio );
40604065
4066+ if (!ptep_clear_young_notify (vma , addr , pte ))
4067+ return false;
4068+
40614069 if (spin_is_contended (pvmw -> ptl ))
4062- return ;
4070+ return true ;
40634071
40644072 /* exclude special VMAs containing anon pages from COW */
40654073 if (vma -> vm_flags & VM_SPECIAL )
4066- return ;
4074+ return true ;
40674075
40684076 /* avoid taking the LRU lock under the PTL when possible */
40694077 walk = current -> reclaim_state ? current -> reclaim_state -> mm_walk : NULL ;
40704078
40714079 start = max (addr & PMD_MASK , vma -> vm_start );
40724080 end = min (addr | ~PMD_MASK , vma -> vm_end - 1 ) + 1 ;
40734081
4082+ if (end - start == PAGE_SIZE )
4083+ return true;
4084+
40744085 if (end - start > MIN_LRU_BATCH * PAGE_SIZE ) {
40754086 if (addr - start < MIN_LRU_BATCH * PAGE_SIZE / 2 )
40764087 end = start + MIN_LRU_BATCH * PAGE_SIZE ;
@@ -4084,7 +4095,7 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
40844095
40854096 /* folio_update_gen() requires stable folio_memcg() */
40864097 if (!mem_cgroup_trylock_pages (memcg ))
4087- return ;
4098+ return true ;
40884099
40894100 arch_enter_lazy_mmu_mode ();
40904101
@@ -4094,19 +4105,16 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
40944105 unsigned long pfn ;
40954106 pte_t ptent = ptep_get (pte + i );
40964107
4097- pfn = get_pte_pfn (ptent , vma , addr );
4108+ pfn = get_pte_pfn (ptent , vma , addr , pgdat );
40984109 if (pfn == -1 )
40994110 continue ;
41004111
4101- if (!pte_young (ptent ))
4102- continue ;
4103-
41044112 folio = get_pfn_folio (pfn , memcg , pgdat , can_swap );
41054113 if (!folio )
41064114 continue ;
41074115
4108- if (!ptep_test_and_clear_young (vma , addr , pte + i ))
4109- VM_WARN_ON_ONCE (true) ;
4116+ if (!ptep_clear_young_notify (vma , addr , pte + i ))
4117+ continue ;
41104118
41114119 young ++ ;
41124120
@@ -4136,6 +4144,8 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
41364144 /* feedback from rmap walkers to page table walkers */
41374145 if (mm_state && suitable_to_scan (i , young ))
41384146 update_bloom_filter (mm_state , max_seq , pvmw -> pmd );
4147+
4148+ return true;
41394149}
41404150
41414151/******************************************************************************
0 commit comments