Commit 38607c6

apopple-nvidia authored and akpm00 committed
fs/dax: properly refcount fs dax pages
Currently fs dax pages are considered free when the refcount drops to one, and their refcounts are not increased when mapped via PTEs or decreased when unmapped. This requires special logic in mm paths to detect that these pages should not be properly refcounted, and to detect when the refcount drops to one instead of zero.

On the other hand get_user_pages(), etc. will properly refcount fs dax pages by taking a reference and dropping it when the page is unpinned.

Tracking this special behaviour requires extra PTE bits (e.g. pte_devmap) and introduces rules that are potentially confusing and specific to FS DAX pages. To fix this, and to possibly allow removal of the special PTE bits in future, convert the fs dax page refcounts to be zero based and instead take a reference on the page each time it is mapped, as is currently the case for normal pages.

This may also allow a future clean-up to remove the pgmap refcounting that is currently done in mm/gup.c.

Link: https://lkml.kernel.org/r/c7d886ad7468a20452ef6e0ddab6cfe220874e7c.1740713401.git-series.apopple@nvidia.com
Signed-off-by: Alistair Popple <apopple@nvidia.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Tested-by: Alison Schofield <alison.schofield@intel.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Asahi Lina <lina@asahilina.net>
Cc: Balbir Singh <balbirs@nvidia.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Chunyan Zhang <zhang.lyra@gmail.com>
Cc: "Darrick J. Wong" <djwong@kernel.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Huacai Chen <chenhuacai@kernel.org>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: linmiaohe <linmiaohe@huawei.com>
Cc: Logan Gunthorpe <logang@deltatee.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Michael "Camp Drill Sergeant" Ellerman <mpe@ellerman.id.au>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Ted Ts'o <tytso@mit.edu>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vishal Verma <vishal.l.verma@intel.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: WANG Xuerui <kernel@xen0n.name>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
1 parent 653d782 · commit 38607c6
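The refcount model this commit moves to is easiest to see in the dax_fault_iter() and dax_insert_pfn_mkwrite() hunks of fs/dax.c below: a freshly initialised fs dax folio starts at refcount zero, each page-table insertion takes an ordinary reference, and the folio is idle again only once the count returns to zero. As a minimal sketch of that pattern — the wrapper name dax_insert_mapped_folio() is hypothetical, while folio_ref_inc(), folio_put() and vmf_insert_page_mkwrite() are the calls actually used in the diff:

/*
 * Sketch only -- dax_insert_mapped_folio() is a hypothetical wrapper, not
 * part of this commit. It shows the per-mapping refcount pattern used by
 * the PTE fault path: a temporary reference keeps the folio alive across
 * the insert, the page-table mapping takes its own reference, and the
 * folio becomes idle again (refcount == 0) once every mapping and pin
 * has been dropped.
 */
static vm_fault_t dax_insert_mapped_folio(struct vm_fault *vmf,
                                          struct folio *folio, bool write)
{
        vm_fault_t ret;

        folio_ref_inc(folio);           /* temporary reference */
        ret = vmf_insert_page_mkwrite(vmf, &folio->page, write);
        folio_put(folio);               /* drop temporary reference */

        return ret;
}

With refcounts zero based, dax_page_is_idle() in include/linux/dax.h can simply test page_ref_count(page) == 0 rather than special-casing a count of one.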

14 files changed: +165 −151 lines changed


drivers/nvdimm/pmem.c

Lines changed: 1 addition & 3 deletions
@@ -513,7 +513,7 @@ static int pmem_attach_disk(struct device *dev,
 
         pmem->disk = disk;
         pmem->pgmap.owner = pmem;
-        pmem->pfn_flags = PFN_DEV;
+        pmem->pfn_flags = 0;
         if (is_nd_pfn(dev)) {
                 pmem->pgmap.type = MEMORY_DEVICE_FS_DAX;
                 pmem->pgmap.ops = &fsdax_pagemap_ops;
@@ -522,7 +522,6 @@ static int pmem_attach_disk(struct device *dev,
                 pmem->data_offset = le64_to_cpu(pfn_sb->dataoff);
                 pmem->pfn_pad = resource_size(res) -
                         range_len(&pmem->pgmap.range);
-                pmem->pfn_flags |= PFN_MAP;
                 bb_range = pmem->pgmap.range;
                 bb_range.start += pmem->data_offset;
         } else if (pmem_should_map_pages(dev)) {
@@ -532,7 +531,6 @@ static int pmem_attach_disk(struct device *dev,
                 pmem->pgmap.type = MEMORY_DEVICE_FS_DAX;
                 pmem->pgmap.ops = &fsdax_pagemap_ops;
                 addr = devm_memremap_pages(dev, &pmem->pgmap);
-                pmem->pfn_flags |= PFN_MAP;
                 bb_range = pmem->pgmap.range;
         } else {
                 addr = devm_memremap(dev, pmem->phys_addr,

fs/dax.c

Lines changed: 114 additions & 72 deletions
@@ -71,6 +71,11 @@ static unsigned long dax_to_pfn(void *entry)
         return xa_to_value(entry) >> DAX_SHIFT;
 }
 
+static struct folio *dax_to_folio(void *entry)
+{
+        return page_folio(pfn_to_page(dax_to_pfn(entry)));
+}
+
 static void *dax_make_entry(pfn_t pfn, unsigned long flags)
 {
         return xa_mk_value(flags | (pfn_t_to_pfn(pfn) << DAX_SHIFT));
@@ -338,19 +343,6 @@ static unsigned long dax_entry_size(void *entry)
         return PAGE_SIZE;
 }
 
-static unsigned long dax_end_pfn(void *entry)
-{
-        return dax_to_pfn(entry) + dax_entry_size(entry) / PAGE_SIZE;
-}
-
-/*
- * Iterate through all mapped pfns represented by an entry, i.e. skip
- * 'empty' and 'zero' entries.
- */
-#define for_each_mapped_pfn(entry, pfn) \
-        for (pfn = dax_to_pfn(entry); \
-                        pfn < dax_end_pfn(entry); pfn++)
-
 /*
  * A DAX folio is considered shared if it has no mapping set and ->share (which
  * shares the ->index field) is non-zero. Note this may return false even if the
@@ -359,7 +351,7 @@ static unsigned long dax_end_pfn(void *entry)
  */
 static inline bool dax_folio_is_shared(struct folio *folio)
 {
-        return !folio->mapping && folio->page.share;
+        return !folio->mapping && folio->share;
 }
 
 /*
@@ -384,75 +376,117 @@ static void dax_folio_make_shared(struct folio *folio)
          * folio has previously been mapped into one address space so set the
          * share count.
          */
-        folio->page.share = 1;
+        folio->share = 1;
 }
 
-static inline unsigned long dax_folio_share_put(struct folio *folio)
+static inline unsigned long dax_folio_put(struct folio *folio)
 {
-        return --folio->page.share;
+        unsigned long ref;
+        int order, i;
+
+        if (!dax_folio_is_shared(folio))
+                ref = 0;
+        else
+                ref = --folio->share;
+
+        if (ref)
+                return ref;
+
+        folio->mapping = NULL;
+        order = folio_order(folio);
+        if (!order)
+                return 0;
+
+        for (i = 0; i < (1UL << order); i++) {
+                struct dev_pagemap *pgmap = page_pgmap(&folio->page);
+                struct page *page = folio_page(folio, i);
+                struct folio *new_folio = (struct folio *)page;
+
+                ClearPageHead(page);
+                clear_compound_head(page);
+
+                new_folio->mapping = NULL;
+                /*
+                 * Reset pgmap which was over-written by
+                 * prep_compound_page().
+                 */
+                new_folio->pgmap = pgmap;
+                new_folio->share = 0;
+                WARN_ON_ONCE(folio_ref_count(new_folio));
+        }
+
+        return ref;
+}
+
+static void dax_folio_init(void *entry)
+{
+        struct folio *folio = dax_to_folio(entry);
+        int order = dax_entry_order(entry);
+
+        /*
+         * Folio should have been split back to order-0 pages in
+         * dax_folio_put() when they were removed from their
+         * final mapping.
+         */
+        WARN_ON_ONCE(folio_order(folio));
+
+        if (order > 0) {
+                prep_compound_page(&folio->page, order);
+                if (order > 1)
+                        INIT_LIST_HEAD(&folio->_deferred_list);
+                WARN_ON_ONCE(folio_ref_count(folio));
+        }
 }
 
 static void dax_associate_entry(void *entry, struct address_space *mapping,
-                struct vm_area_struct *vma, unsigned long address, bool shared)
+                struct vm_area_struct *vma,
+                unsigned long address, bool shared)
 {
-        unsigned long size = dax_entry_size(entry), pfn, index;
-        int i = 0;
+        unsigned long size = dax_entry_size(entry), index;
+        struct folio *folio = dax_to_folio(entry);
 
         if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
                 return;
 
         index = linear_page_index(vma, address & ~(size - 1));
-        for_each_mapped_pfn(entry, pfn) {
-                struct folio *folio = pfn_folio(pfn);
-
-                if (shared && (folio->mapping || folio->page.share)) {
-                        if (folio->mapping)
-                                dax_folio_make_shared(folio);
+        if (shared && (folio->mapping || dax_folio_is_shared(folio))) {
+                if (folio->mapping)
+                        dax_folio_make_shared(folio);
 
-                        WARN_ON_ONCE(!folio->page.share);
-                        folio->page.share++;
-                } else {
-                        WARN_ON_ONCE(folio->mapping);
-                        folio->mapping = mapping;
-                        folio->index = index + i++;
-                }
+                WARN_ON_ONCE(!folio->share);
+                WARN_ON_ONCE(dax_entry_order(entry) != folio_order(folio));
+                folio->share++;
+        } else {
+                WARN_ON_ONCE(folio->mapping);
+                dax_folio_init(entry);
+                folio = dax_to_folio(entry);
+                folio->mapping = mapping;
+                folio->index = index;
         }
 }
 
 static void dax_disassociate_entry(void *entry, struct address_space *mapping,
-                bool trunc)
+                                   bool trunc)
 {
-        unsigned long pfn;
+        struct folio *folio = dax_to_folio(entry);
 
         if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
                 return;
 
-        for_each_mapped_pfn(entry, pfn) {
-                struct folio *folio = pfn_folio(pfn);
-
-                WARN_ON_ONCE(trunc && folio_ref_count(folio) > 1);
-                if (dax_folio_is_shared(folio)) {
-                        /* keep the shared flag if this page is still shared */
-                        if (dax_folio_share_put(folio) > 0)
-                                continue;
-                } else
-                        WARN_ON_ONCE(folio->mapping && folio->mapping != mapping);
-                folio->mapping = NULL;
-                folio->index = 0;
-        }
+        dax_folio_put(folio);
 }
 
 static struct page *dax_busy_page(void *entry)
 {
-        unsigned long pfn;
+        struct folio *folio = dax_to_folio(entry);
 
-        for_each_mapped_pfn(entry, pfn) {
-                struct page *page = pfn_to_page(pfn);
+        if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry))
+                return NULL;
 
-                if (page_ref_count(page) > 1)
-                        return page;
-        }
-        return NULL;
+        if (folio_ref_count(folio) - folio_mapcount(folio))
+                return &folio->page;
+        else
+                return NULL;
 }
 
 /**
@@ -785,7 +819,7 @@ struct page *dax_layout_busy_page(struct address_space *mapping)
 EXPORT_SYMBOL_GPL(dax_layout_busy_page);
 
 static int __dax_invalidate_entry(struct address_space *mapping,
-                                          pgoff_t index, bool trunc)
+                                  pgoff_t index, bool trunc)
 {
         XA_STATE(xas, &mapping->i_pages, index);
         int ret = 0;
@@ -953,7 +987,8 @@ void dax_break_layout_final(struct inode *inode)
                 wait_page_idle_uninterruptible(page, inode);
         } while (true);
 
-        dax_delete_mapping_range(inode->i_mapping, 0, LLONG_MAX);
+        if (!page)
+                dax_delete_mapping_range(inode->i_mapping, 0, LLONG_MAX);
 }
 EXPORT_SYMBOL_GPL(dax_break_layout_final);
 
@@ -1039,8 +1074,10 @@ static void *dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf,
         void *old;
 
         dax_disassociate_entry(entry, mapping, false);
-        dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address,
-                        shared);
+        if (!(flags & DAX_ZERO_PAGE))
+                dax_associate_entry(new_entry, mapping, vmf->vma,
+                                vmf->address, shared);
+
         /*
          * Only swap our new entry into the page cache if the current
          * entry is a zero page or an empty entry. If a normal PTE or
@@ -1228,9 +1265,7 @@ static int dax_iomap_direct_access(const struct iomap *iomap, loff_t pos,
                 goto out;
         if (pfn_t_to_pfn(*pfnp) & (PHYS_PFN(size)-1))
                 goto out;
-        /* For larger pages we need devmap */
-        if (length > 1 && !pfn_t_devmap(*pfnp))
-                goto out;
+
         rc = 0;
 
 out_check_addr:
@@ -1337,7 +1372,7 @@ static vm_fault_t dax_load_hole(struct xa_state *xas, struct vm_fault *vmf,
 
         *entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, DAX_ZERO_PAGE);
 
-        ret = vmf_insert_mixed(vmf->vma, vaddr, pfn);
+        ret = vmf_insert_page_mkwrite(vmf, pfn_t_to_page(pfn), false);
         trace_dax_load_hole(inode, vmf, ret);
         return ret;
 }
@@ -1808,7 +1843,8 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf,
         loff_t pos = (loff_t)xas->xa_index << PAGE_SHIFT;
         bool write = iter->flags & IOMAP_WRITE;
         unsigned long entry_flags = pmd ? DAX_PMD : 0;
-        int err = 0;
+        struct folio *folio;
+        int ret, err = 0;
         pfn_t pfn;
         void *kaddr;
 
@@ -1840,17 +1876,19 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf,
                 return dax_fault_return(err);
         }
 
+        folio = dax_to_folio(*entry);
         if (dax_fault_is_synchronous(iter, vmf->vma))
                 return dax_fault_synchronous_pfnp(pfnp, pfn);
 
-        /* insert PMD pfn */
+        folio_ref_inc(folio);
         if (pmd)
-                return vmf_insert_pfn_pmd(vmf, pfn, write);
+                ret = vmf_insert_folio_pmd(vmf, pfn_folio(pfn_t_to_pfn(pfn)),
+                                write);
+        else
+                ret = vmf_insert_page_mkwrite(vmf, pfn_t_to_page(pfn), write);
+        folio_put(folio);
 
-        /* insert PTE pfn */
-        if (write)
-                return vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
-        return vmf_insert_mixed(vmf->vma, vmf->address, pfn);
+        return ret;
 }
 
 static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
@@ -2089,6 +2127,7 @@ dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order)
 {
         struct address_space *mapping = vmf->vma->vm_file->f_mapping;
         XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, order);
+        struct folio *folio;
         void *entry;
         vm_fault_t ret;
 
@@ -2106,14 +2145,17 @@ dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order)
         xas_set_mark(&xas, PAGECACHE_TAG_DIRTY);
         dax_lock_entry(&xas, entry);
         xas_unlock_irq(&xas);
+        folio = pfn_folio(pfn_t_to_pfn(pfn));
+        folio_ref_inc(folio);
         if (order == 0)
-                ret = vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
+                ret = vmf_insert_page_mkwrite(vmf, &folio->page, true);
 #ifdef CONFIG_FS_DAX_PMD
         else if (order == PMD_ORDER)
-                ret = vmf_insert_pfn_pmd(vmf, pfn, FAULT_FLAG_WRITE);
+                ret = vmf_insert_folio_pmd(vmf, folio, FAULT_FLAG_WRITE);
 #endif
         else
                 ret = VM_FAULT_FALLBACK;
+        folio_put(folio);
         dax_unlock_entry(&xas, entry);
         trace_dax_insert_pfn_mkwrite(mapping->host, vmf, ret);
         return ret;

fs/fuse/virtio_fs.c

Lines changed: 1 addition & 2 deletions
@@ -1017,8 +1017,7 @@ static long virtio_fs_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
         if (kaddr)
                 *kaddr = fs->window_kaddr + offset;
         if (pfn)
-                *pfn = phys_to_pfn_t(fs->window_phys_addr + offset,
-                                PFN_DEV | PFN_MAP);
+                *pfn = phys_to_pfn_t(fs->window_phys_addr + offset, 0);
         return nr_pages > max_nr_pages ? max_nr_pages : nr_pages;
 }
 
include/linux/dax.h

Lines changed: 1 addition & 1 deletion
@@ -209,7 +209,7 @@ int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
 
 static inline bool dax_page_is_idle(struct page *page)
 {
-        return page && page_ref_count(page) == 1;
+        return page && page_ref_count(page) == 0;
 }
 
 #if IS_ENABLED(CONFIG_DAX)
