Skip to content

Commit f5548c3

Browse files
pedrodemargomes authored and akpm00 committed
ksm: use range-walk function to jump over holes in scan_get_next_rmap_item
Currently, scan_get_next_rmap_item() walks every page address in a VMA to locate mergeable pages. This becomes highly inefficient when scanning large virtual memory areas that contain mostly unmapped regions, causing ksmd to use large amount of cpu without deduplicating much pages. This patch replaces the per-address lookup with a range walk using walk_page_range(). The range walker allows KSM to skip over entire unmapped holes in a VMA, avoiding unnecessary lookups. This problem was previously discussed in [1]. Consider the following test program which creates a 32 TiB mapping in the virtual address space but only populates a single page: #include <unistd.h> #include <stdio.h> #include <sys/mman.h> /* 32 TiB */ const size_t size = 32ul * 1024 * 1024 * 1024 * 1024; int main() { char *area = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_NORESERVE | MAP_PRIVATE | MAP_ANON, -1, 0); if (area == MAP_FAILED) { perror("mmap() failed\n"); return -1; } /* Populate a single page such that we get an anon_vma. */ *area = 0; /* Enable KSM. */ madvise(area, size, MADV_MERGEABLE); pause(); return 0; } $ ./ksm-sparse & $ echo 1 > /sys/kernel/mm/ksm/run Without this patch ksmd uses 100% of the cpu for a long time (more then 1 hour in my test machine) scanning all the 32 TiB virtual address space that contain only one mapped page. This makes ksmd essentially deadlocked not able to deduplicate anything of value. With this patch ksmd walks only the one mapped page and skips the rest of the 32 TiB virtual address space, making the scan fast using little cpu. 
Link: https://lkml.kernel.org/r/20251023035841.41406-1-pedrodemargomes@gmail.com Link: https://lkml.kernel.org/r/20251022153059.22763-1-pedrodemargomes@gmail.com Link: https://lore.kernel.org/linux-mm/423de7a3-1c62-4e72-8e79-19a6413e420c@redhat.com/ [1] Fixes: 31dbd01 ("ksm: Kernel SamePage Merging") Signed-off-by: Pedro Demarchi Gomes <pedrodemargomes@gmail.com> Co-developed-by: David Hildenbrand <david@redhat.com> Signed-off-by: David Hildenbrand <david@redhat.com> Reported-by: craftfever <craftfever@airmail.cc> Closes: https://lkml.kernel.org/r/020cf8de6e773bb78ba7614ef250129f11a63781@murena.io Suggested-by: David Hildenbrand <david@redhat.com> Acked-by: David Hildenbrand <david@redhat.com> Cc: Chengming Zhou <chengming.zhou@linux.dev> Cc: xu xin <xu.xin16@zte.com.cn> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
1 parent 7e76b75 commit f5548c3

File tree

1 file changed

+104
-9
lines changed

1 file changed

+104
-9
lines changed

mm/ksm.c

Lines changed: 104 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2455,6 +2455,95 @@ static bool should_skip_rmap_item(struct folio *folio,
24552455
return true;
24562456
}
24572457

2458+
/*
 * Result of a walk with ksm_next_page_ops: filled in by
 * ksm_next_page_pmd_entry() when it finds the next candidate page.
 */
struct ksm_next_page_arg {
	struct folio *folio;	/* folio containing @page; a reference is held */
	struct page *page;	/* the present anon page that was found */
	unsigned long addr;	/* virtual address @page was found at */
};
2463+
2464+
/*
 * pmd_entry callback for the KSM range walk: find the first present,
 * non-zone-device anonymous page in [addr, end) under this PMD.
 *
 * On success, stores the page/folio/address in the walk's private
 * struct ksm_next_page_arg (with a folio reference taken) and returns 1,
 * which stops the walk; walk_page_range_vma() then propagates that value
 * to the caller. Returns 0 to continue the walk when nothing is found,
 * which lets the walker skip entire unmapped holes.
 */
static int ksm_next_page_pmd_entry(pmd_t *pmdp, unsigned long addr, unsigned long end,
		struct mm_walk *walk)
{
	struct ksm_next_page_arg *private = walk->private;
	struct vm_area_struct *vma = walk->vma;
	pte_t *start_ptep = NULL, *ptep, pte;
	struct mm_struct *mm = walk->mm;
	struct folio *folio;
	struct page *page;
	spinlock_t *ptl;
	pmd_t pmd;

	/* Bail out early if the mm is going away. */
	if (ksm_test_exit(mm))
		return 0;

	cond_resched();

	/* Lockless peek first; a racing change is re-checked under the lock. */
	pmd = pmdp_get_lockless(pmdp);
	if (!pmd_present(pmd))
		return 0;

	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && pmd_leaf(pmd)) {
		ptl = pmd_lock(mm, pmdp);
		/* Re-read now that the PMD is stable under the lock. */
		pmd = pmdp_get(pmdp);

		if (!pmd_present(pmd)) {
			goto not_found_unlock;
		} else if (pmd_leaf(pmd)) {
			page = vm_normal_page_pmd(vma, addr, pmd);
			if (!page)
				goto not_found_unlock;
			folio = page_folio(page);

			/* KSM only merges ordinary anonymous memory. */
			if (folio_is_zone_device(folio) || !folio_test_anon(folio))
				goto not_found_unlock;

			/* Step from the huge-page head to the subpage at @addr. */
			page += ((addr & (PMD_SIZE - 1)) >> PAGE_SHIFT);
			goto found_unlock;
		}
		/* PMD changed under us (no longer a leaf); fall back to PTEs. */
		spin_unlock(ptl);
	}

	/* Map and lock the PTE table; NULL means it vanished — nothing here. */
	start_ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
	if (!start_ptep)
		return 0;

	for (ptep = start_ptep; addr < end; ptep++, addr += PAGE_SIZE) {
		pte = ptep_get(ptep);

		if (!pte_present(pte))
			continue;

		page = vm_normal_page(vma, addr, pte);
		if (!page)
			continue;
		folio = page_folio(page);

		if (folio_is_zone_device(folio) || !folio_test_anon(folio))
			continue;
		goto found_unlock;
	}

	/*
	 * Both exits below release @ptl (PMD or PTE lock, whichever was
	 * taken last) and unmap the PTE table if one was mapped.
	 */
not_found_unlock:
	spin_unlock(ptl);
	if (start_ptep)
		pte_unmap(start_ptep);
	return 0;
found_unlock:
	/* Pin the folio before dropping the lock so it cannot be freed. */
	folio_get(folio);
	spin_unlock(ptl);
	if (start_ptep)
		pte_unmap(start_ptep);
	private->page = page;
	private->folio = folio;
	private->addr = addr;
	return 1;
}
2541+
2542+
static struct mm_walk_ops ksm_next_page_ops = {
2543+
.pmd_entry = ksm_next_page_pmd_entry,
2544+
.walk_lock = PGWALK_RDLOCK,
2545+
};
2546+
24582547
static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
24592548
{
24602549
struct mm_struct *mm;
@@ -2542,21 +2631,27 @@ static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
25422631
ksm_scan.address = vma->vm_end;
25432632

25442633
while (ksm_scan.address < vma->vm_end) {
2634+
struct ksm_next_page_arg ksm_next_page_arg;
25452635
struct page *tmp_page = NULL;
2546-
struct folio_walk fw;
25472636
struct folio *folio;
25482637

25492638
if (ksm_test_exit(mm))
25502639
break;
25512640

2552-
folio = folio_walk_start(&fw, vma, ksm_scan.address, 0);
2553-
if (folio) {
2554-
if (!folio_is_zone_device(folio) &&
2555-
folio_test_anon(folio)) {
2556-
folio_get(folio);
2557-
tmp_page = fw.page;
2558-
}
2559-
folio_walk_end(&fw, vma);
2641+
int found;
2642+
2643+
found = walk_page_range_vma(vma, ksm_scan.address,
2644+
vma->vm_end,
2645+
&ksm_next_page_ops,
2646+
&ksm_next_page_arg);
2647+
2648+
if (found > 0) {
2649+
folio = ksm_next_page_arg.folio;
2650+
tmp_page = ksm_next_page_arg.page;
2651+
ksm_scan.address = ksm_next_page_arg.addr;
2652+
} else {
2653+
VM_WARN_ON_ONCE(found < 0);
2654+
ksm_scan.address = vma->vm_end - PAGE_SIZE;
25602655
}
25612656

25622657
if (tmp_page) {

0 commit comments

Comments
 (0)