Commit 685b5e6
Merge: Endless calls to xas_split_alloc() due to corrupted xarray entry
MR: https://gitlab.com/redhat/centos-stream/src/kernel/centos-stream-9/-/merge_requests/4687
JIRA: https://issues.redhat.com/browse/RHEL-14441

Splitting a huge page on arm64-64k exposed a limitation in xarray. Backported
these 4 upstream commits that resolved it upstream:

  9fd154b mm/shmem: disable PMD-sized page cache if needed
  3390916 mm/filemap: skip to create PMD-sized page cache if needed
  1f789a4 mm/readahead: limit page cache size in page_cache_ra_order()
  099d906 mm/filemap: make MAX_PAGECACHE_ORDER acceptable to xarray

For the above to apply, these 4 patches had to be backported from upstream as
well (patches 1-4):

  ffc143d filemap: Add fgf_t typedef
  4f66170 filemap: Allow __filemap_get_folio to allocate large folios
  79c1374 filemap: add helper mapping_max_folio_size()
  099d906 mm/filemap: make MAX_PAGECACHE_ORDER acceptable to xarray

and this one as patch 6, just before the mm/readahead change:

  e03c16f readahead: use ilog2 instead of a while loop in page_cache_ra_order()

Signed-off-by: Donald Dutile <ddutile@redhat.com>
Approved-by: Gavin Shan <gshan@redhat.com>
Approved-by: Chris von Recklinghausen <crecklin@redhat.com>
Approved-by: David Hildenbrand <david@redhat.com>
Approved-by: Audra Mitchell <aubaker@redhat.com>
Approved-by: CKI KWF Bot <cki-ci-bot+kwf-gitlab-com@redhat.com>
Merged-by: Lucas Zampieri <lzampier@redhat.com>
2 parents 380f618 + 92d45ac commit 685b5e6
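For context, the arithmetic behind the failure: with 64KB base pages on arm64, a PMD maps 512MB, i.e. a folio of order 13, while xas_split_alloc() can only split entries up to order 2 * XA_CHUNK_SHIFT - 1 = 11 (see the MAX_XAS_ORDER definition added below). The stand-alone sketch that follows is illustration only, not part of the series; the numeric values assume a default arm64-64k configuration with XA_CHUNK_SHIFT of 6.

/*
 * Back-of-the-envelope check of why arm64-64k trips over xas_split_alloc().
 * All values are assumptions for a default 64KB-page arm64 build.
 */
#include <stdio.h>

int main(void)
{
	unsigned int page_shift = 16;		/* 64KB base pages */
	unsigned int pmd_shift = 29;		/* one PMD maps 512MB */
	unsigned int xa_chunk_shift = 6;	/* default XA_CHUNK_SHIFT */

	unsigned int hpage_pmd_order = pmd_shift - page_shift;	/* 13 */
	unsigned int max_xas_order = 2 * xa_chunk_shift - 1;	/* 11 */

	/* 13 > 11: a PMD-sized page cache entry cannot be split by xarray. */
	printf("HPAGE_PMD_ORDER = %u, MAX_XAS_ORDER = %u\n",
	       hpage_pmd_order, max_xas_order);
	return 0;
}

With the series applied, MAX_PAGECACHE_ORDER is capped at 11 on such a configuration, so PMD-sized (order-13) page cache folios are simply not created there.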

File tree

8 files changed: +143, -67 lines

fs/f2fs/compress.c

Lines changed: 1 addition & 1 deletion
@@ -996,7 +996,7 @@ static int prepare_compress_overwrite(struct compress_ctx *cc,
 	struct address_space *mapping = cc->inode->i_mapping;
 	struct page *page;
 	sector_t last_block_in_bio;
-	unsigned fgp_flag = FGP_LOCK | FGP_WRITE | FGP_CREAT;
+	fgf_t fgp_flag = FGP_LOCK | FGP_WRITE | FGP_CREAT;
 	pgoff_t start_idx = start_idx_of_cluster(cc);
 	int i, ret;
 

fs/f2fs/f2fs.h

Lines changed: 1 addition & 1 deletion
@@ -2549,7 +2549,7 @@ static inline struct page *f2fs_grab_cache_page(struct address_space *mapping,
 
 static inline struct page *f2fs_pagecache_get_page(
 				struct address_space *mapping, pgoff_t index,
-				int fgp_flags, gfp_t gfp_mask)
+				fgf_t fgp_flags, gfp_t gfp_mask)
 {
 	if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_GET)) {
 		f2fs_show_injection_info(F2FS_M_SB(mapping), FAULT_PAGE_GET);

fs/iomap/buffered-io.c

Lines changed: 1 addition & 1 deletion
@@ -544,7 +544,7 @@ EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate);
  */
 struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos)
 {
-	unsigned fgp = FGP_WRITEBEGIN | FGP_NOFS;
+	fgf_t fgp = FGP_WRITEBEGIN | FGP_NOFS;
 
 	if (iter->flags & IOMAP_NOWAIT)
 		fgp |= FGP_NOWAIT;

include/linux/pagemap.h

Lines changed: 86 additions & 11 deletions
@@ -327,6 +327,26 @@ static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask)
 	m->gfp_mask = mask;
 }
 
+/*
+ * There are some parts of the kernel which assume that PMD entries
+ * are exactly HPAGE_PMD_ORDER. Those should be fixed, but until then,
+ * limit the maximum allocation order to PMD size. I'm not aware of any
+ * assumptions about maximum order if THP are disabled, but 8 seems like
+ * a good order (that's 1MB if you're using 4kB pages)
+ */
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#define PREFERRED_MAX_PAGECACHE_ORDER	HPAGE_PMD_ORDER
+#else
+#define PREFERRED_MAX_PAGECACHE_ORDER	8
+#endif
+
+/*
+ * xas_split_alloc() does not support arbitrary orders. This implies no
+ * 512MB THP on ARM64 with 64KB base page size.
+ */
+#define MAX_XAS_ORDER		(XA_CHUNK_SHIFT * 2 - 1)
+#define MAX_PAGECACHE_ORDER	min(MAX_XAS_ORDER, PREFERRED_MAX_PAGECACHE_ORDER)
+
 /**
  * mapping_set_large_folios() - Indicate the file supports large folios.
  * @mapping: The file.
@@ -353,6 +373,14 @@ static inline bool mapping_large_folio_support(struct address_space *mapping)
 		test_bit(AS_LARGE_FOLIO_SUPPORT, &mapping->flags);
 }
 
+/* Return the maximum folio size for this pagecache mapping, in bytes. */
+static inline size_t mapping_max_folio_size(struct address_space *mapping)
+{
+	if (mapping_large_folio_support(mapping))
+		return PAGE_SIZE << MAX_PAGECACHE_ORDER;
+	return PAGE_SIZE;
+}
+
 static inline int filemap_nr_thps(struct address_space *mapping)
 {
 #ifdef CONFIG_READ_ONLY_THP_FOR_FS
@@ -534,22 +562,69 @@ pgoff_t page_cache_next_miss(struct address_space *mapping,
 pgoff_t page_cache_prev_miss(struct address_space *mapping,
 		pgoff_t index, unsigned long max_scan);
 
-#define FGP_ACCESSED		0x00000001
-#define FGP_LOCK		0x00000002
-#define FGP_CREAT		0x00000004
-#define FGP_WRITE		0x00000008
-#define FGP_NOFS		0x00000010
-#define FGP_NOWAIT		0x00000020
-#define FGP_FOR_MMAP		0x00000040
-#define FGP_STABLE		0x00000080
+/**
+ * typedef fgf_t - Flags for getting folios from the page cache.
+ *
+ * Most users of the page cache will not need to use these flags;
+ * there are convenience functions such as filemap_get_folio() and
+ * filemap_lock_folio(). For users which need more control over exactly
+ * what is done with the folios, these flags to __filemap_get_folio()
+ * are available.
+ *
+ * * %FGP_ACCESSED - The folio will be marked accessed.
+ * * %FGP_LOCK - The folio is returned locked.
+ * * %FGP_CREAT - If no folio is present then a new folio is allocated,
+ *   added to the page cache and the VM's LRU list. The folio is
+ *   returned locked.
+ * * %FGP_FOR_MMAP - The caller wants to do its own locking dance if the
+ *   folio is already in cache. If the folio was allocated, unlock it
+ *   before returning so the caller can do the same dance.
+ * * %FGP_WRITE - The folio will be written to by the caller.
+ * * %FGP_NOFS - __GFP_FS will get cleared in gfp.
+ * * %FGP_NOWAIT - Don't block on the folio lock.
+ * * %FGP_STABLE - Wait for the folio to be stable (finished writeback)
+ * * %FGP_WRITEBEGIN - The flags to use in a filesystem write_begin()
+ *   implementation.
+ */
+typedef unsigned int __bitwise fgf_t;
+
+#define FGP_ACCESSED		((__force fgf_t)0x00000001)
+#define FGP_LOCK		((__force fgf_t)0x00000002)
+#define FGP_CREAT		((__force fgf_t)0x00000004)
+#define FGP_WRITE		((__force fgf_t)0x00000008)
+#define FGP_NOFS		((__force fgf_t)0x00000010)
+#define FGP_NOWAIT		((__force fgf_t)0x00000020)
+#define FGP_FOR_MMAP		((__force fgf_t)0x00000040)
+#define FGP_STABLE		((__force fgf_t)0x00000080)
+#define FGF_GET_ORDER(fgf)	(((__force unsigned)fgf) >> 26)	/* top 6 bits */
+
+/**
+ * fgf_set_order - Encode a length in the fgf_t flags.
+ * @size: The suggested size of the folio to create.
+ *
+ * The caller of __filemap_get_folio() can use this to suggest a preferred
+ * size for the folio that is created. If there is already a folio at
+ * the index, it will be returned, no matter what its size. If a folio
+ * is freshly created, it may be of a different size than requested
+ * due to alignment constraints, memory pressure, or the presence of
+ * other folios at nearby indices.
+ */
+static inline fgf_t fgf_set_order(size_t size)
+{
+	unsigned int shift = ilog2(size);
+
+	if (shift <= PAGE_SHIFT)
+		return 0;
+	return (__force fgf_t)((shift - PAGE_SHIFT) << 26);
+}
 
 void *filemap_get_entry(struct address_space *mapping, pgoff_t index);
 #define FGP_WRITEBEGIN	(FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE)
 
 struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
-		int fgp_flags, gfp_t gfp);
+		fgf_t fgp_flags, gfp_t gfp);
 struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index,
-		int fgp_flags, gfp_t gfp);
+		fgf_t fgp_flags, gfp_t gfp);
 
 /**
  * filemap_get_folio - Find and get a folio.
@@ -623,7 +698,7 @@ static inline struct page *find_get_page(struct address_space *mapping,
 }
 
 static inline struct page *find_get_page_flags(struct address_space *mapping,
-					pgoff_t offset, int fgp_flags)
+					pgoff_t offset, fgf_t fgp_flags)
 {
 	return pagecache_get_page(mapping, offset, fgp_flags, 0);
 }
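The fgf_t change above folds a size hint into the same flags word: the low bits remain the familiar FGP_* flags, and the top 6 bits carry a folio order written by fgf_set_order() and read back with FGF_GET_ORDER(). As a hypothetical illustration (the function name and call site below are invented, not taken from this series), a write path could combine them like this:

#include <linux/pagemap.h>

/*
 * Hypothetical caller, illustration only: ask for a folio large enough
 * to cover a len-byte write at pos, in the style of a write_begin()
 * implementation built on FGP_WRITEBEGIN.
 */
static struct folio *example_get_write_folio(struct address_space *mapping,
					     loff_t pos, size_t len)
{
	fgf_t fgp = FGP_WRITEBEGIN | fgf_set_order(len);

	/* The top 6 bits of fgp now carry the suggested folio order. */
	return __filemap_get_folio(mapping, pos >> PAGE_SHIFT, fgp,
				   mapping_gfp_mask(mapping));
}

Inside __filemap_get_folio() (see the mm/filemap.c hunks below) the hint is recovered with FGF_GET_ORDER(fgp_flags) and then clamped against mapping_large_folio_support() and MAX_PAGECACHE_ORDER.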

mm/filemap.c

Lines changed: 36 additions & 31 deletions
@@ -1866,30 +1866,15 @@ void *filemap_get_entry(struct address_space *mapping, pgoff_t index)
  *
  * Looks up the page cache entry at @mapping & @index.
  *
- * @fgp_flags can be zero or more of these flags:
- *
- * * %FGP_ACCESSED - The folio will be marked accessed.
- * * %FGP_LOCK - The folio is returned locked.
- * * %FGP_CREAT - If no page is present then a new page is allocated using
- *   @gfp and added to the page cache and the VM's LRU list.
- *   The page is returned locked and with an increased refcount.
- * * %FGP_FOR_MMAP - The caller wants to do its own locking dance if the
- *   page is already in cache. If the page was allocated, unlock it before
- *   returning so the caller can do the same dance.
- * * %FGP_WRITE - The page will be written to by the caller.
- * * %FGP_NOFS - __GFP_FS will get cleared in gfp.
- * * %FGP_NOWAIT - Don't get blocked by page lock.
- * * %FGP_STABLE - Wait for the folio to be stable (finished writeback)
- *
  * If %FGP_LOCK or %FGP_CREAT are specified then the function may sleep even
  * if the %GFP flags specified for %FGP_CREAT are atomic.
  *
- * If there is a page cache page, it is returned with an increased refcount.
+ * If this function returns a folio, it is returned with an increased refcount.
  *
  * Return: The found folio or an ERR_PTR() otherwise.
  */
 struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
-		int fgp_flags, gfp_t gfp)
+		fgf_t fgp_flags, gfp_t gfp)
 {
 	struct folio *folio;
 
@@ -1931,7 +1916,9 @@ struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
 		folio_wait_stable(folio);
 no_page:
 	if (!folio && (fgp_flags & FGP_CREAT)) {
+		unsigned order = FGF_GET_ORDER(fgp_flags);
 		int err;
+
 		if ((fgp_flags & FGP_WRITE) && mapping_can_writeback(mapping))
 			gfp |= __GFP_WRITE;
 		if (fgp_flags & FGP_NOFS)
@@ -1940,26 +1927,44 @@ struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
 			gfp &= ~GFP_KERNEL;
 			gfp |= GFP_NOWAIT | __GFP_NOWARN;
 		}
-
-		folio = filemap_alloc_folio(gfp, 0);
-		if (!folio)
-			return ERR_PTR(-ENOMEM);
-
 		if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP))))
 			fgp_flags |= FGP_LOCK;
 
-		/* Init accessed so avoid atomic mark_page_accessed later */
-		if (fgp_flags & FGP_ACCESSED)
-			__folio_set_referenced(folio);
+		if (!mapping_large_folio_support(mapping))
+			order = 0;
+		if (order > MAX_PAGECACHE_ORDER)
+			order = MAX_PAGECACHE_ORDER;
+		/* If we're not aligned, allocate a smaller folio */
+		if (index & ((1UL << order) - 1))
+			order = __ffs(index);
 
-		err = filemap_add_folio(mapping, folio, index, gfp);
-		if (unlikely(err)) {
+		do {
+			gfp_t alloc_gfp = gfp;
+
+			err = -ENOMEM;
+			if (order == 1)
+				order = 0;
+			if (order > 0)
+				alloc_gfp |= __GFP_NORETRY | __GFP_NOWARN;
+			folio = filemap_alloc_folio(alloc_gfp, order);
+			if (!folio)
+				continue;
+
+			/* Init accessed so avoid atomic mark_page_accessed later */
+			if (fgp_flags & FGP_ACCESSED)
+				__folio_set_referenced(folio);
+
+			err = filemap_add_folio(mapping, folio, index, gfp);
+			if (!err)
+				break;
 			folio_put(folio);
 			folio = NULL;
-			if (err == -EEXIST)
-				goto repeat;
-		}
+		} while (order-- > 0);
 
+		if (err == -EEXIST)
+			goto repeat;
+		if (err)
+			return ERR_PTR(err);
 		/*
 		 * filemap_add_folio locks the page, and for mmap
 		 * we expect an unlocked page.
@@ -3210,7 +3215,7 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 	/* Use the readahead code, even if readahead is disabled */
-	if (vm_flags & VM_HUGEPAGE) {
+	if ((vm_flags & VM_HUGEPAGE) && HPAGE_PMD_ORDER <= MAX_PAGECACHE_ORDER) {
 		fpin = maybe_unlock_mmap_for_io(vmf, fpin);
 		ractl._index &= ~((unsigned long)HPAGE_PMD_NR - 1);
 		ra->size = HPAGE_PMD_NR;
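The new allocation path first clamps the caller's suggested order, then walks downwards: it tries the largest acceptable order, skips order 1 (order-1 folios are not used in the page cache), and retries at ever smaller orders on allocation or insertion failure until order 0. Below is a user-space sketch of just the clamping step; clamp_order() and the sample index are invented for illustration, and MAX_PAGECACHE_ORDER is assumed to be 11, the arm64-64k value.

/*
 * Sketch of the order clamping performed by __filemap_get_folio()
 * before its allocation loop (illustration only).
 */
#include <stdio.h>

#define MAX_PAGECACHE_ORDER 11			/* assumed arm64-64k value */

static unsigned int clamp_order(unsigned long index, unsigned int order,
				int large_folio_support)
{
	if (!large_folio_support)
		order = 0;
	if (order > MAX_PAGECACHE_ORDER)
		order = MAX_PAGECACHE_ORDER;
	/* If the index is not aligned, allocate a smaller folio. */
	if (index & ((1UL << order) - 1))
		order = __builtin_ctzl(index);	/* stands in for __ffs() */
	return order;
}

int main(void)
{
	/* Index 0x2100 is only aligned to 2^8 pages, so a suggested
	 * order of 13 is first capped to 11 and then reduced to 8. */
	printf("order = %u\n", clamp_order(0x2100, 13, 1));
	return 0;
}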

mm/folio-compat.c

Lines changed: 1 addition & 1 deletion
@@ -92,7 +92,7 @@ EXPORT_SYMBOL(add_to_page_cache_lru);
 
 noinline
 struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index,
-		int fgp_flags, gfp_t gfp)
+		fgf_t fgp_flags, gfp_t gfp)
 {
 	struct folio *folio;
 
mm/readahead.c

Lines changed: 4 additions & 19 deletions
@@ -462,19 +462,6 @@ static int try_context_readahead(struct address_space *mapping,
 	return 1;
 }
 
-/*
- * There are some parts of the kernel which assume that PMD entries
- * are exactly HPAGE_PMD_ORDER. Those should be fixed, but until then,
- * limit the maximum allocation order to PMD size. I'm not aware of any
- * assumptions about maximum order if THP are disabled, but 8 seems like
- * a good order (that's 1MB if you're using 4kB pages)
- */
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-#define MAX_PAGECACHE_ORDER	HPAGE_PMD_ORDER
-#else
-#define MAX_PAGECACHE_ORDER	8
-#endif
-
 static inline int ra_alloc_folio(struct readahead_control *ractl, pgoff_t index,
 		pgoff_t mark, unsigned int order, gfp_t gfp)
 {
@@ -513,13 +500,11 @@ void page_cache_ra_order(struct readahead_control *ractl,
 
 	limit = min(limit, index + ra->size - 1);
 
-	if (new_order < MAX_PAGECACHE_ORDER) {
+	if (new_order < MAX_PAGECACHE_ORDER)
 		new_order += 2;
-		if (new_order > MAX_PAGECACHE_ORDER)
-			new_order = MAX_PAGECACHE_ORDER;
-		while ((1 << new_order) > ra->size)
-			new_order--;
-	}
+
+	new_order = min_t(unsigned int, MAX_PAGECACHE_ORDER, new_order);
+	new_order = min_t(unsigned int, new_order, ilog2(ra->size));
 
 	/* See comment in page_cache_ra_unbounded() */
 	nofs = memalloc_nofs_save();
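The replaced while loop and the new pair of min_t() calls compute the same bound, since the largest order with (1 << order) <= ra->size is ilog2(ra->size). One visible difference is that the clamping now applies even when new_order did not receive the +2 bump, i.e. when it already started at or above MAX_PAGECACHE_ORDER. Below is a small user-space sketch of the new calculation; ra_new_order() and ilog2_u() are invented names, min_t() is modelled with plain comparisons, and MAX_PAGECACHE_ORDER is assumed to be 11.

/*
 * Sketch of the new_order computation in page_cache_ra_order()
 * after this series (illustration only).
 */
#include <stdio.h>

#define MAX_PAGECACHE_ORDER 11			/* assumed arm64-64k value */

static unsigned int ilog2_u(unsigned long v)	/* v must be non-zero */
{
	return 8 * sizeof(v) - 1 - __builtin_clzl(v);
}

static unsigned int ra_new_order(unsigned int new_order, unsigned long ra_size)
{
	if (new_order < MAX_PAGECACHE_ORDER)
		new_order += 2;

	if (new_order > MAX_PAGECACHE_ORDER)	/* min_t(..., MAX_PAGECACHE_ORDER, new_order) */
		new_order = MAX_PAGECACHE_ORDER;
	if (new_order > ilog2_u(ra_size))	/* min_t(..., new_order, ilog2(ra->size)) */
		new_order = ilog2_u(ra_size);
	return new_order;
}

int main(void)
{
	/* Previous order 9, 1024-page window: 9 + 2 = 11, capped by both
	 * MAX_PAGECACHE_ORDER (11) and ilog2(1024) = 10, giving 10. */
	printf("new_order = %u\n", ra_new_order(9, 1024));
	return 0;
}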

mm/shmem.c

Lines changed: 13 additions & 2 deletions
@@ -469,8 +469,9 @@ static bool shmem_confirm_swap(struct address_space *mapping,
 
 static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER;
 
-bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force,
-		   struct mm_struct *mm, unsigned long vm_flags)
+static bool __shmem_is_huge(struct inode *inode, pgoff_t index,
+			    bool shmem_huge_force, struct mm_struct *mm,
+			    unsigned long vm_flags)
 {
 	loff_t i_size;
 
@@ -501,6 +502,16 @@ bool shmem_is_huge(struct inode *inode, pgoff_t index, bool shmem_huge_force,
 	}
 }
 
+bool shmem_is_huge(struct inode *inode, pgoff_t index,
+		   bool shmem_huge_force, struct mm_struct *mm,
+		   unsigned long vm_flags)
+{
+	if (HPAGE_PMD_ORDER > MAX_PAGECACHE_ORDER)
+		return false;
+
+	return __shmem_is_huge(inode, index, shmem_huge_force, mm, vm_flags);
+}
+
 #if defined(CONFIG_SYSFS)
 static int shmem_parse_huge(const char *str)
 {
