Commit 1faef9b
x86/mm/pat: restore large ROX pages after fragmentation
jira LE-4694
Rebuild_History Non-Buildable kernel-6.12.0-55.43.1.el10_0
commit-author Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
commit 41d8848

Changing the attributes of pages may fragment the direct mapping over time and degrade performance when those pages contain executable code. With the current code it is a one-way road: the kernel tries to avoid splitting large pages, but it never restores them, even when the page attributes become compatible again.

Any change to the mapping may potentially allow a large page to be restored. Add a hook to the cpa_flush() path that checks whether the pages in the range that was just touched can be mapped at the PMD level. If the collapse at the PMD level succeeds, also attempt to collapse at the PUD level.

The collapse logic runs only when a set_memory_ method explicitly sets the CPA_COLLAPSE flag; for now this is only enabled in set_memory_rox().

CPUs don't like[1] having TLB entries of different sizes for the same memory, but it appears to be okay as long as those entries have matching attributes[2]. Therefore it is critical to flush the TLB before any subsequent changes to the mapping. Note that we already allow multiple TLB entries of different sizes for the same memory in the split_large_page() path, so this is not a new situation.

set_memory_4k() provides a way to use 4k pages on purpose. The kernel must not remap such pages as large. Re-use one of the software PTE bits to mark such pages.

[1] See Erratum 383 of AMD Family 10h Processors
[2] https://lore.kernel.org/linux-mm/1da1b025-cabc-6f04-bde5-e50830d1ecf0@amd.com/

[rppt@kernel.org:
 * s/restore/collapse/
 * update formatting per peterz
 * use 'struct ptdesc' instead of 'struct page' for the list of page tables to be freed
 * try to collapse PMD first and if it succeeds move on to PUD as peterz suggested
 * flush TLB twice: for changes done in the original CPA call and after collapsing of large pages
 * update commit message]

Signed-off-by: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Co-developed-by: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
Signed-off-by: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20250126074733.1384926-4-rppt@kernel.org
(cherry picked from commit 41d8848)
Signed-off-by: Jonathan Maple <jmaple@ciq.com>
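For illustration only, a minimal sketch of a caller that would exercise the new path. The allocation helper and its name are hypothetical; only set_memory_rox() and its new CPA_COLLAPSE behaviour come from this commit:

#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/printk.h>
#include <linux/set_memory.h>

/* Hypothetical helper: emit code into a direct-map buffer, then seal it RO+X. */
static void *alloc_sealed_text(unsigned int nr_pages)
{
	unsigned long addr = __get_free_pages(GFP_KERNEL,
					      get_order(nr_pages * PAGE_SIZE));

	if (!addr)
		return NULL;

	/* ... write the code into the buffer while the mapping is still RW ... */

	/*
	 * Make the range read-only + executable. With this patch the call also
	 * passes CPA_COLLAPSE, so cpa_flush() will try to restore 2M/1G direct
	 * mappings around the range once the attribute change has been flushed.
	 */
	if (set_memory_rox(addr, nr_pages))
		pr_warn("set_memory_rox() failed\n");

	return (void *)addr;
}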
1 parent: 8e7f899

4 files changed, 219 insertions(+), 4 deletions(-)


arch/x86/include/asm/pgtable_types.h

Lines changed: 2 additions & 0 deletions
@@ -33,6 +33,7 @@
 #define _PAGE_BIT_CPA_TEST	_PAGE_BIT_SOFTW1
 #define _PAGE_BIT_UFFD_WP	_PAGE_BIT_SOFTW2 /* userfaultfd wrprotected */
 #define _PAGE_BIT_SOFT_DIRTY	_PAGE_BIT_SOFTW3 /* software dirty tracking */
+#define _PAGE_BIT_KERNEL_4K	_PAGE_BIT_SOFTW3 /* page must not be converted to large */
 #define _PAGE_BIT_DEVMAP	_PAGE_BIT_SOFTW4
 
 #ifdef CONFIG_X86_64
@@ -62,6 +63,7 @@
 #define _PAGE_PAT_LARGE	(_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
 #define _PAGE_SPECIAL	(_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
 #define _PAGE_CPA_TEST	(_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST)
+#define _PAGE_KERNEL_4K	(_AT(pteval_t, 1) << _PAGE_BIT_KERNEL_4K)
 #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
 #define _PAGE_PKEY_BIT0	(_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT0)
 #define _PAGE_PKEY_BIT1	(_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT1)

arch/x86/mm/pat/set_memory.c

Lines changed: 213 additions & 4 deletions
@@ -75,6 +75,7 @@ static DEFINE_SPINLOCK(cpa_lock);
 #define CPA_ARRAY		2
 #define CPA_PAGES_ARRAY		4
 #define CPA_NO_CHECK_ALIAS	8 /* Do not search for aliases */
+#define CPA_COLLAPSE		16 /* try to collapse large pages */
 
 static inline pgprot_t cachemode2pgprot(enum page_cache_mode pcm)
 {
@@ -107,6 +108,18 @@ static void split_page_count(int level)
 	direct_pages_count[level - 1] += PTRS_PER_PTE;
 }
 
+static void collapse_page_count(int level)
+{
+	direct_pages_count[level]++;
+	if (system_state == SYSTEM_RUNNING) {
+		if (level == PG_LEVEL_2M)
+			count_vm_event(DIRECT_MAP_LEVEL2_COLLAPSE);
+		else if (level == PG_LEVEL_1G)
+			count_vm_event(DIRECT_MAP_LEVEL3_COLLAPSE);
+	}
+	direct_pages_count[level - 1] -= PTRS_PER_PTE;
+}
+
 void arch_report_meminfo(struct seq_file *m)
 {
 	seq_printf(m, "DirectMap4k:    %8lu kB\n",
@@ -124,6 +137,7 @@ void arch_report_meminfo(struct seq_file *m)
 }
 #else
 static inline void split_page_count(int level) { }
+static inline void collapse_page_count(int level) { }
 #endif
 
 #ifdef CONFIG_X86_CPA_STATISTICS
@@ -396,6 +410,40 @@ static void __cpa_flush_tlb(void *data)
 		flush_tlb_one_kernel(fix_addr(__cpa_addr(cpa, i)));
 }
 
+static int collapse_large_pages(unsigned long addr, struct list_head *pgtables);
+
+static void cpa_collapse_large_pages(struct cpa_data *cpa)
+{
+	unsigned long start, addr, end;
+	struct ptdesc *ptdesc, *tmp;
+	LIST_HEAD(pgtables);
+	int collapsed = 0;
+	int i;
+
+	if (cpa->flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) {
+		for (i = 0; i < cpa->numpages; i++)
+			collapsed += collapse_large_pages(__cpa_addr(cpa, i),
+							  &pgtables);
+	} else {
+		addr = __cpa_addr(cpa, 0);
+		start = addr & PMD_MASK;
+		end = addr + PAGE_SIZE * cpa->numpages;
+
+		for (addr = start; within(addr, start, end); addr += PMD_SIZE)
+			collapsed += collapse_large_pages(addr, &pgtables);
+	}
+
+	if (!collapsed)
+		return;
+
+	flush_tlb_all();
+
+	list_for_each_entry_safe(ptdesc, tmp, &pgtables, pt_list) {
+		list_del(&ptdesc->pt_list);
+		__free_page(ptdesc_page(ptdesc));
+	}
+}
+
 static void cpa_flush(struct cpa_data *cpa, int cache)
 {
 	unsigned int i;
@@ -404,7 +452,7 @@ static void cpa_flush(struct cpa_data *cpa, int cache)
 
 	if (cache && !static_cpu_has(X86_FEATURE_CLFLUSH)) {
 		cpa_flush_all(cache);
-		return;
+		goto collapse_large_pages;
 	}
 
 	if (cpa->force_flush_all || cpa->numpages > tlb_single_page_flush_ceiling)
@@ -413,7 +461,7 @@
 		on_each_cpu(__cpa_flush_tlb, cpa, 1);
 
 	if (!cache)
-		return;
+		goto collapse_large_pages;
 
 	mb();
 	for (i = 0; i < cpa->numpages; i++) {
@@ -429,6 +477,10 @@ static void cpa_flush(struct cpa_data *cpa, int cache)
 		clflush_cache_range_opt((void *)fix_addr(addr), PAGE_SIZE);
 	}
 	mb();
+
+collapse_large_pages:
+	if (cpa->flags & CPA_COLLAPSE)
+		cpa_collapse_large_pages(cpa);
 }
 
 static bool overlaps(unsigned long r1_start, unsigned long r1_end,
@@ -1198,6 +1250,161 @@ static int split_large_page(struct cpa_data *cpa, pte_t *kpte,
 	return 0;
 }
 
+static int collapse_pmd_page(pmd_t *pmd, unsigned long addr,
+			     struct list_head *pgtables)
+{
+	pmd_t _pmd, old_pmd;
+	pte_t *pte, first;
+	unsigned long pfn;
+	pgprot_t pgprot;
+	int i = 0;
+
+	addr &= PMD_MASK;
+	pte = pte_offset_kernel(pmd, addr);
+	first = *pte;
+	pfn = pte_pfn(first);
+
+	/* Make sure alignment is suitable */
+	if (PFN_PHYS(pfn) & ~PMD_MASK)
+		return 0;
+
+	/* The page is 4k intentionally */
+	if (pte_flags(first) & _PAGE_KERNEL_4K)
+		return 0;
+
+	/* Check that the rest of PTEs are compatible with the first one */
+	for (i = 1, pte++; i < PTRS_PER_PTE; i++, pte++) {
+		pte_t entry = *pte;
+
+		if (!pte_present(entry))
+			return 0;
+		if (pte_flags(entry) != pte_flags(first))
+			return 0;
+		if (pte_pfn(entry) != pte_pfn(first) + i)
+			return 0;
+	}
+
+	old_pmd = *pmd;
+
+	/* Success: set up a large page */
+	pgprot = pgprot_4k_2_large(pte_pgprot(first));
+	pgprot_val(pgprot) |= _PAGE_PSE;
+	_pmd = pfn_pmd(pfn, pgprot);
+	set_pmd(pmd, _pmd);
+
+	/* Queue the page table to be freed after TLB flush */
+	list_add(&page_ptdesc(pmd_page(old_pmd))->pt_list, pgtables);
+
+	if (IS_ENABLED(CONFIG_X86_32) && !SHARED_KERNEL_PMD) {
+		struct page *page;
+
+		/* Update all PGD tables to use the same large page */
+		list_for_each_entry(page, &pgd_list, lru) {
+			pgd_t *pgd = (pgd_t *)page_address(page) + pgd_index(addr);
+			p4d_t *p4d = p4d_offset(pgd, addr);
+			pud_t *pud = pud_offset(p4d, addr);
+			pmd_t *pmd = pmd_offset(pud, addr);
+			/* Something is wrong if entries doesn't match */
+			if (WARN_ON(pmd_val(old_pmd) != pmd_val(*pmd)))
+				continue;
+			set_pmd(pmd, _pmd);
+		}
+	}
+
+	if (virt_addr_valid(addr) && pfn_range_is_mapped(pfn, pfn + 1))
+		collapse_page_count(PG_LEVEL_2M);
+
+	return 1;
+}
+
+static int collapse_pud_page(pud_t *pud, unsigned long addr,
+			     struct list_head *pgtables)
+{
+	unsigned long pfn;
+	pmd_t *pmd, first;
+	int i;
+
+	if (!direct_gbpages)
+		return 0;
+
+	addr &= PUD_MASK;
+	pmd = pmd_offset(pud, addr);
+	first = *pmd;
+
+	/*
+	 * To restore PUD page all PMD entries must be large and
+	 * have suitable alignment
+	 */
+	pfn = pmd_pfn(first);
+	if (!pmd_leaf(first) || (PFN_PHYS(pfn) & ~PUD_MASK))
+		return 0;
+
+	/*
+	 * To restore PUD page, all following PMDs must be compatible with the
+	 * first one.
+	 */
+	for (i = 1, pmd++; i < PTRS_PER_PMD; i++, pmd++) {
+		pmd_t entry = *pmd;
+
+		if (!pmd_present(entry) || !pmd_leaf(entry))
+			return 0;
+		if (pmd_flags(entry) != pmd_flags(first))
+			return 0;
+		if (pmd_pfn(entry) != pmd_pfn(first) + i * PTRS_PER_PTE)
+			return 0;
+	}
+
+	/* Restore PUD page and queue page table to be freed after TLB flush */
+	list_add(&page_ptdesc(pud_page(*pud))->pt_list, pgtables);
+	set_pud(pud, pfn_pud(pfn, pmd_pgprot(first)));
+
+	if (virt_addr_valid(addr) && pfn_range_is_mapped(pfn, pfn + 1))
+		collapse_page_count(PG_LEVEL_1G);
+
+	return 1;
+}
+
+/*
+ * Collapse PMD and PUD pages in the kernel mapping around the address where
+ * possible.
+ *
+ * Caller must flush TLB and free page tables queued on the list before
+ * touching the new entries. CPU must not see TLB entries of different size
+ * with different attributes.
+ */
+static int collapse_large_pages(unsigned long addr, struct list_head *pgtables)
+{
+	int collapsed = 0;
+	pgd_t *pgd;
+	p4d_t *p4d;
+	pud_t *pud;
+	pmd_t *pmd;
+
+	addr &= PMD_MASK;
+
+	spin_lock(&pgd_lock);
+	pgd = pgd_offset_k(addr);
+	if (pgd_none(*pgd))
+		goto out;
+	p4d = p4d_offset(pgd, addr);
+	if (p4d_none(*p4d))
+		goto out;
+	pud = pud_offset(p4d, addr);
+	if (!pud_present(*pud) || pud_leaf(*pud))
+		goto out;
+	pmd = pmd_offset(pud, addr);
+	if (!pmd_present(*pmd) || pmd_leaf(*pmd))
+		goto out;
+
+	collapsed = collapse_pmd_page(pmd, addr, pgtables);
+	if (collapsed)
+		collapsed += collapse_pud_page(pud, addr, pgtables);
+
+out:
+	spin_unlock(&pgd_lock);
+	return collapsed;
+}
+
 static bool try_to_free_pte_page(pte_t *pte)
 {
 	int i;
@@ -2121,7 +2328,8 @@ int set_memory_rox(unsigned long addr, int numpages)
 	if (__supported_pte_mask & _PAGE_NX)
 		clr.pgprot |= _PAGE_NX;
 
-	return change_page_attr_clear(&addr, numpages, clr, 0);
+	return change_page_attr_set_clr(&addr, numpages, __pgprot(0), clr, 0,
+					CPA_COLLAPSE, NULL);
 }
 
 int set_memory_rw(unsigned long addr, int numpages)
@@ -2148,7 +2356,8 @@ int set_memory_p(unsigned long addr, int numpages)
 
 int set_memory_4k(unsigned long addr, int numpages)
 {
-	return change_page_attr_set_clr(&addr, numpages, __pgprot(0),
+	return change_page_attr_set_clr(&addr, numpages,
+					__pgprot(_PAGE_KERNEL_4K),
 					__pgprot(0), 1, 0, NULL);
 }
 
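Read together, the set_memory.c hunks above wire the collapse into the existing CPA flush path. A rough call-flow summary derived from these hunks (a reading aid, not code from the patch):

/*
 * set_memory_rox(addr, numpages)
 *   change_page_attr_set_clr(..., CPA_COLLAPSE, ...)
 *     cpa_flush(cpa, cache)                       - flush the attribute change first
 *       cpa_collapse_large_pages(cpa)             - only if cpa->flags has CPA_COLLAPSE
 *         collapse_large_pages(addr, &pgtables)   - per PMD-aligned address, under pgd_lock
 *           collapse_pmd_page()                   - PTRS_PER_PTE compatible 4k PTEs -> one 2M PMD
 *           collapse_pud_page()                   - PTRS_PER_PMD compatible 2M PMDs -> one 1G PUD
 *         flush_tlb_all()                         - second flush, before the old tables go away
 *         __free_page(ptdesc_page(ptdesc))        - free the queued page tables
 */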

include/linux/vm_event_item.h

Lines changed: 2 additions & 0 deletions
@@ -151,6 +151,8 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 #ifdef CONFIG_X86
 		DIRECT_MAP_LEVEL2_SPLIT,
 		DIRECT_MAP_LEVEL3_SPLIT,
+		DIRECT_MAP_LEVEL2_COLLAPSE,
+		DIRECT_MAP_LEVEL3_COLLAPSE,
 #endif
 #ifdef CONFIG_PER_VMA_LOCK_STATS
 		VMA_LOCK_SUCCESS,

mm/vmstat.c

Lines changed: 2 additions & 0 deletions
@@ -1432,6 +1432,8 @@ const char * const vmstat_text[] = {
 #ifdef CONFIG_X86
 	"direct_map_level2_splits",
 	"direct_map_level3_splits",
+	"direct_map_level2_collapses",
+	"direct_map_level3_collapses",
 #endif
 #ifdef CONFIG_PER_VMA_LOCK_STATS
 	"vma_lock_success",

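The two new events surface in /proc/vmstat next to the existing direct-map split counters. A minimal userspace C sketch (hypothetical helper, not part of the patch) for dumping them:

#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/vmstat", "r");

	if (!f)
		return 1;

	/* Print the direct_map_level* split/collapse counters, if present */
	while (fgets(line, sizeof(line), f)) {
		if (strncmp(line, "direct_map_level", 16) == 0)
			fputs(line, stdout);
	}

	fclose(f);
	return 0;
}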