
Commit 808c350

Merge: mm/gup, udmabuf: Complete memfd_pin_folios() for pinning memfd folios
MR: https://gitlab.com/redhat/centos-stream/src/kernel/centos-stream-9/-/merge_requests/6826
JIRA: https://issues.redhat.com/browse/RHEL-89519
Depends: https://gitlab.com/redhat/centos-stream/src/kernel/centos-stream-9/-/merge_requests/6414
Tested: The udmabuf selftest ran successfully on the patched kernel with the appropriate hugetlb settings.

This MR completes the backport of the patch series "mm/gup: Introduce memfd_pin_folios() for pinning memfd folios", along with the associated and relevant fix commits and other patches that reduce merge conflicts. It also backports the updated udmabuf selftest patches. This is needed to support the AMD-proposed solution for letting the memory cgroup limit the memory consumption of GPU-heavy applications.

Signed-off-by: Waiman Long <longman@redhat.com>
Approved-by: Donald Dutile <ddutile@redhat.com>
Approved-by: Rafael Aquini <raquini@redhat.com>
Approved-by: CKI KWF Bot <cki-ci-bot+kwf-gitlab-com@redhat.com>
Merged-by: Jan Stancek <jstancek@redhat.com>
2 parents b8b966e + 400dd9d commit 808c350
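Background for reviewers: the series backported here adds the GUP helper memfd_pin_folios(), which udmabuf now calls (see udmabuf_pin_folios() in the diff below) instead of taking per-page references itself. The following is a minimal, hypothetical sketch of the call pattern only; my_pin_range(), its simplified error handling, and the include choices are illustrative and not part of this commit.

#include <linux/mm.h>	/* memfd_pin_folios(), unpin_folio() */
#include <linux/slab.h>	/* kvmalloc_array(), kvfree() */

/*
 * Hypothetical helper: pin the page-aligned range [start, start + size)
 * of a memfd and then release the pins again. It mirrors the call
 * pattern of udmabuf_pin_folios() in the diff below.
 */
static long my_pin_range(struct file *memfd, loff_t start, loff_t size)
{
	pgoff_t nr_pages = size >> PAGE_SHIFT;
	pgoff_t first_offset;	/* offset of @start within the first folio */
	struct folio **folios;
	long nr_folios, i;

	folios = kvmalloc_array(nr_pages, sizeof(*folios), GFP_KERNEL);
	if (!folios)
		return -ENOMEM;

	/*
	 * Pins the folios backing [start, start + size - 1]; for shmem and
	 * hugetlb memfds the folios are allocated on demand if the range is
	 * not populated yet.
	 */
	nr_folios = memfd_pin_folios(memfd, start, start + size - 1,
				     folios, nr_pages, &first_offset);
	if (nr_folios <= 0)
		goto out;

	/* ... hand the folios to hardware or map them here ... */

	/* Each returned folio holds one pin; drop them when done. */
	for (i = 0; i < nr_folios; i++)
		unpin_folio(folios[i]);
out:
	kvfree(folios);
	return nr_folios;
}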

File tree: 13 files changed, +420 -147 lines


Documentation/admin-guide/mm/hugetlbpage.rst (7 additions, 0 deletions)

@@ -378,6 +378,13 @@ Note that the number of overcommit and reserve pages remain global quantities,
 as we don't know until fault time, when the faulting task's mempolicy is
 applied, from which node the huge page allocation will be attempted.
 
+The hugetlb may be migrated between the per-node hugepages pool in the following
+scenarios: memory offline, memory failure, longterm pinning, syscalls(mbind,
+migrate_pages and move_pages), alloc_contig_range() and alloc_contig_pages().
+Now only memory offline, memory failure and syscalls allow fallbacking to allocate
+a new hugetlb on a different node if the current node is unable to allocate during
+hugetlb migration, that means these 3 cases can break the per-node hugepages pool.
+
 .. _using_huge_pages:
 
 Using Huge Pages

drivers/dma-buf/udmabuf.c (152 additions, 98 deletions)

@@ -30,22 +30,57 @@ struct udmabuf {
         struct sg_table *sg;
         struct miscdevice *device;
         pgoff_t *offsets;
+        struct list_head unpin_list;
+};
+
+struct udmabuf_folio {
+        struct folio *folio;
+        struct list_head list;
 };
 
 static vm_fault_t udmabuf_vm_fault(struct vm_fault *vmf)
 {
         struct vm_area_struct *vma = vmf->vma;
         struct udmabuf *ubuf = vma->vm_private_data;
         pgoff_t pgoff = vmf->pgoff;
-        unsigned long pfn;
+        unsigned long addr, pfn;
+        vm_fault_t ret;
 
         if (pgoff >= ubuf->pagecount)
                 return VM_FAULT_SIGBUS;
 
         pfn = folio_pfn(ubuf->folios[pgoff]);
         pfn += ubuf->offsets[pgoff] >> PAGE_SHIFT;
 
-        return vmf_insert_pfn(vma, vmf->address, pfn);
+        ret = vmf_insert_pfn(vma, vmf->address, pfn);
+        if (ret & VM_FAULT_ERROR)
+                return ret;
+
+        /* pre fault */
+        pgoff = vma->vm_pgoff;
+        addr = vma->vm_start;
+
+        for (; addr < vma->vm_end; pgoff++, addr += PAGE_SIZE) {
+                if (addr == vmf->address)
+                        continue;
+
+                if (WARN_ON(pgoff >= ubuf->pagecount))
+                        break;
+
+                pfn = folio_pfn(ubuf->folios[pgoff]);
+                pfn += ubuf->offsets[pgoff] >> PAGE_SHIFT;
+
+                /**
+                 * If the below vmf_insert_pfn() fails, we do not return an
+                 * error here during this pre-fault step. However, an error
+                 * will be returned if the failure occurs when the addr is
+                 * truly accessed.
+                 */
+                if (vmf_insert_pfn(vma, addr, pfn) & VM_FAULT_ERROR)
+                        break;
+        }
+
+        return ret;
 }
 
 static const struct vm_operations_struct udmabuf_vm_ops = {
@@ -161,17 +196,43 @@ static void unmap_udmabuf(struct dma_buf_attachment *at,
         return put_sg_table(at->dev, sg, direction);
 }
 
+static void unpin_all_folios(struct list_head *unpin_list)
+{
+        struct udmabuf_folio *ubuf_folio;
+
+        while (!list_empty(unpin_list)) {
+                ubuf_folio = list_first_entry(unpin_list,
+                                              struct udmabuf_folio, list);
+                unpin_folio(ubuf_folio->folio);
+
+                list_del(&ubuf_folio->list);
+                kfree(ubuf_folio);
+        }
+}
+
+static int add_to_unpin_list(struct list_head *unpin_list,
+                             struct folio *folio)
+{
+        struct udmabuf_folio *ubuf_folio;
+
+        ubuf_folio = kzalloc(sizeof(*ubuf_folio), GFP_KERNEL);
+        if (!ubuf_folio)
+                return -ENOMEM;
+
+        ubuf_folio->folio = folio;
+        list_add_tail(&ubuf_folio->list, unpin_list);
+        return 0;
+}
+
 static void release_udmabuf(struct dma_buf *buf)
 {
         struct udmabuf *ubuf = buf->priv;
         struct device *dev = ubuf->device->this_device;
-        pgoff_t pg;
 
         if (ubuf->sg)
                 put_sg_table(dev, ubuf->sg, DMA_BIDIRECTIONAL);
 
-        for (pg = 0; pg < ubuf->pagecount; pg++)
-                folio_put(ubuf->folios[pg]);
+        unpin_all_folios(&ubuf->unpin_list);
         kvfree(ubuf->offsets);
         kvfree(ubuf->folios);
         kfree(ubuf);
@@ -226,71 +287,10 @@ static const struct dma_buf_ops udmabuf_ops = {
 #define SEALS_WANTED (F_SEAL_SHRINK)
 #define SEALS_DENIED (F_SEAL_WRITE|F_SEAL_FUTURE_WRITE)
 
-static int handle_hugetlb_pages(struct udmabuf *ubuf, struct file *memfd,
-                                pgoff_t offset, pgoff_t pgcnt,
-                                pgoff_t *pgbuf)
-{
-        struct hstate *hpstate = hstate_file(memfd);
-        pgoff_t mapidx = offset >> huge_page_shift(hpstate);
-        pgoff_t subpgoff = (offset & ~huge_page_mask(hpstate)) >> PAGE_SHIFT;
-        pgoff_t maxsubpgs = huge_page_size(hpstate) >> PAGE_SHIFT;
-        struct folio *folio = NULL;
-        pgoff_t pgidx;
-
-        mapidx <<= huge_page_order(hpstate);
-        for (pgidx = 0; pgidx < pgcnt; pgidx++) {
-                if (!folio) {
-                        folio = __filemap_get_folio(memfd->f_mapping,
-                                                    mapidx,
-                                                    FGP_ACCESSED, 0);
-                        if (IS_ERR(folio))
-                                return PTR_ERR(folio);
-                }
-
-                folio_get(folio);
-                ubuf->folios[*pgbuf] = folio;
-                ubuf->offsets[*pgbuf] = subpgoff << PAGE_SHIFT;
-                (*pgbuf)++;
-                if (++subpgoff == maxsubpgs) {
-                        folio_put(folio);
-                        folio = NULL;
-                        subpgoff = 0;
-                        mapidx += pages_per_huge_page(hpstate);
-                }
-        }
-
-        if (folio)
-                folio_put(folio);
-
-        return 0;
-}
-
-static int handle_shmem_pages(struct udmabuf *ubuf, struct file *memfd,
-                              pgoff_t offset, pgoff_t pgcnt,
-                              pgoff_t *pgbuf)
-{
-        pgoff_t pgidx, pgoff = offset >> PAGE_SHIFT;
-        struct folio *folio = NULL;
-
-        for (pgidx = 0; pgidx < pgcnt; pgidx++) {
-                folio = shmem_read_folio(memfd->f_mapping, pgoff + pgidx);
-                if (IS_ERR(folio))
-                        return PTR_ERR(folio);
-
-                ubuf->folios[*pgbuf] = folio;
-                (*pgbuf)++;
-        }
-
-        return 0;
-}
-
 static int check_memfd_seals(struct file *memfd)
 {
         int seals;
 
-        if (!memfd)
-                return -EBADFD;
-
         if (!shmem_file(memfd) && !is_file_hugepages(memfd))
                 return -EBADFD;
 
@@ -319,69 +319,126 @@ static struct dma_buf *export_udmabuf(struct udmabuf *ubuf,
         return dma_buf_export(&exp_info);
 }
 
+static long udmabuf_pin_folios(struct udmabuf *ubuf, struct file *memfd,
+                               loff_t start, loff_t size)
+{
+        pgoff_t pgoff, pgcnt, upgcnt = ubuf->pagecount;
+        struct folio **folios = NULL;
+        u32 cur_folio, cur_pgcnt;
+        long nr_folios;
+        long ret = 0;
+        loff_t end;
+
+        pgcnt = size >> PAGE_SHIFT;
+        folios = kvmalloc_array(pgcnt, sizeof(*folios), GFP_KERNEL);
+        if (!folios)
+                return -ENOMEM;
+
+        end = start + (pgcnt << PAGE_SHIFT) - 1;
+        nr_folios = memfd_pin_folios(memfd, start, end, folios, pgcnt, &pgoff);
+        if (nr_folios <= 0) {
+                ret = nr_folios ? nr_folios : -EINVAL;
+                goto end;
+        }
+
+        cur_pgcnt = 0;
+        for (cur_folio = 0; cur_folio < nr_folios; ++cur_folio) {
+                pgoff_t subpgoff = pgoff;
+                size_t fsize = folio_size(folios[cur_folio]);
+
+                ret = add_to_unpin_list(&ubuf->unpin_list, folios[cur_folio]);
+                if (ret < 0)
+                        goto end;
+
+                for (; subpgoff < fsize; subpgoff += PAGE_SIZE) {
+                        ubuf->folios[upgcnt] = folios[cur_folio];
+                        ubuf->offsets[upgcnt] = subpgoff;
+                        ++upgcnt;
+
+                        if (++cur_pgcnt >= pgcnt)
+                                goto end;
+                }
+
+                /**
+                 * In a given range, only the first subpage of the first folio
+                 * has an offset, that is returned by memfd_pin_folios().
+                 * The first subpages of other folios (in the range) have an
+                 * offset of 0.
+                 */
+                pgoff = 0;
+        }
+end:
+        ubuf->pagecount = upgcnt;
+        kvfree(folios);
+        return ret;
+}
+
 static long udmabuf_create(struct miscdevice *device,
                            struct udmabuf_create_list *head,
                            struct udmabuf_create_item *list)
 {
-        pgoff_t pgcnt, pgbuf = 0, pglimit;
-        struct file *memfd = NULL;
+        pgoff_t pgcnt = 0, pglimit;
         struct udmabuf *ubuf;
         struct dma_buf *dmabuf;
-        int ret = -EINVAL;
+        long ret = -EINVAL;
         u32 i, flags;
 
         ubuf = kzalloc(sizeof(*ubuf), GFP_KERNEL);
         if (!ubuf)
                 return -ENOMEM;
 
+        INIT_LIST_HEAD(&ubuf->unpin_list);
         pglimit = (size_limit_mb * 1024 * 1024) >> PAGE_SHIFT;
         for (i = 0; i < head->count; i++) {
-                if (!IS_ALIGNED(list[i].offset, PAGE_SIZE))
+                if (!PAGE_ALIGNED(list[i].offset))
                         goto err;
-                if (!IS_ALIGNED(list[i].size, PAGE_SIZE))
+                if (!PAGE_ALIGNED(list[i].size))
                         goto err;
-                ubuf->pagecount += list[i].size >> PAGE_SHIFT;
-                if (ubuf->pagecount > pglimit)
+
+                pgcnt += list[i].size >> PAGE_SHIFT;
+                if (pgcnt > pglimit)
                         goto err;
         }
 
-        if (!ubuf->pagecount)
+        if (!pgcnt)
                 goto err;
 
-        ubuf->folios = kvmalloc_array(ubuf->pagecount, sizeof(*ubuf->folios),
-                                      GFP_KERNEL);
+        ubuf->folios = kvmalloc_array(pgcnt, sizeof(*ubuf->folios), GFP_KERNEL);
         if (!ubuf->folios) {
                 ret = -ENOMEM;
                 goto err;
         }
-        ubuf->offsets = kvcalloc(ubuf->pagecount, sizeof(*ubuf->offsets),
-                                 GFP_KERNEL);
+
+        ubuf->offsets = kvcalloc(pgcnt, sizeof(*ubuf->offsets), GFP_KERNEL);
         if (!ubuf->offsets) {
                 ret = -ENOMEM;
                 goto err;
         }
 
-        pgbuf = 0;
         for (i = 0; i < head->count; i++) {
-                memfd = fget(list[i].memfd);
-                ret = check_memfd_seals(memfd);
-                if (ret < 0)
-                        goto err;
+                struct file *memfd = fget(list[i].memfd);
 
-                pgcnt = list[i].size >> PAGE_SHIFT;
-                if (is_file_hugepages(memfd))
-                        ret = handle_hugetlb_pages(ubuf, memfd,
-                                                   list[i].offset,
-                                                   pgcnt, &pgbuf);
-                else
-                        ret = handle_shmem_pages(ubuf, memfd,
-                                                 list[i].offset,
-                                                 pgcnt, &pgbuf);
-                if (ret < 0)
+                if (!memfd) {
+                        ret = -EBADFD;
                         goto err;
+                }
 
+                /*
+                 * Take the inode lock to protect against concurrent
+                 * memfd_add_seals(), which takes this lock in write mode.
+                 */
+                inode_lock_shared(file_inode(memfd));
+                ret = check_memfd_seals(memfd);
+                if (ret)
+                        goto out_unlock;
+
+                ret = udmabuf_pin_folios(ubuf, memfd, list[i].offset,
+                                         list[i].size);
+out_unlock:
+                inode_unlock_shared(file_inode(memfd));
                 fput(memfd);
-                memfd = NULL;
+                if (ret)
+                        goto err;
         }
 
         flags = head->flags & UDMABUF_FLAGS_CLOEXEC ? O_CLOEXEC : 0;
@@ -403,10 +460,7 @@ static long udmabuf_create(struct miscdevice *device,
         return ret;
 
 err:
-        while (pgbuf > 0)
-                folio_put(ubuf->folios[--pgbuf]);
-        if (memfd)
-                fput(memfd);
+        unpin_all_folios(&ubuf->unpin_list);
         kvfree(ubuf->offsets);
         kvfree(ubuf->folios);
         kfree(ubuf);
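The "Tested:" note above refers to the udmabuf selftest, which drives this code path from userspace through the UDMABUF_CREATE ioctl. The snippet below is a rough, hypothetical illustration of that flow, not the selftest itself; the helper name create_udmabuf_from_memfd() is made up and error handling is minimal.

#define _GNU_SOURCE
#include <fcntl.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <unistd.h>
#include <linux/udmabuf.h>

/* Illustrative helper: back a udmabuf with one page-aligned memfd range. */
static int create_udmabuf_from_memfd(size_t size)
{
	struct udmabuf_create create;
	int memfd, devfd, buffd = -1;

	/* The driver wants F_SEAL_SHRINK set and rejects write seals. */
	memfd = memfd_create("udmabuf-backing", MFD_ALLOW_SEALING);
	if (memfd < 0)
		return -1;
	if (ftruncate(memfd, size) < 0 ||
	    fcntl(memfd, F_ADD_SEALS, F_SEAL_SHRINK) < 0)
		goto out_memfd;

	devfd = open("/dev/udmabuf", O_RDWR);
	if (devfd < 0)
		goto out_memfd;

	memset(&create, 0, sizeof(create));
	create.memfd  = memfd;
	create.flags  = UDMABUF_FLAGS_CLOEXEC;
	create.offset = 0;	/* must be PAGE_SIZE aligned */
	create.size   = size;	/* must be PAGE_SIZE aligned */

	/*
	 * On success this returns a dma-buf fd; the memfd pages stay
	 * pinned until that fd is released.
	 */
	buffd = ioctl(devfd, UDMABUF_CREATE, &create);

	close(devfd);
out_memfd:
	close(memfd);
	return buffd;
}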
