@@ -30,22 +30,57 @@ struct udmabuf {
 	struct sg_table *sg;
 	struct miscdevice *device;
 	pgoff_t *offsets;
+	struct list_head unpin_list;
+};
+
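+/*
+ * One entry per folio returned by memfd_pin_folios(); the unpin_list
+ * lets release_udmabuf() (and the udmabuf_create() error path) drop
+ * every pin when the buffer goes away.
+ */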
+struct udmabuf_folio {
+	struct folio *folio;
+	struct list_head list;
 };
 
 static vm_fault_t udmabuf_vm_fault(struct vm_fault *vmf)
 {
 	struct vm_area_struct *vma = vmf->vma;
 	struct udmabuf *ubuf = vma->vm_private_data;
 	pgoff_t pgoff = vmf->pgoff;
-	unsigned long pfn;
+	unsigned long addr, pfn;
+	vm_fault_t ret;
 
 	if (pgoff >= ubuf->pagecount)
 		return VM_FAULT_SIGBUS;
 
 	pfn = folio_pfn(ubuf->folios[pgoff]);
 	pfn += ubuf->offsets[pgoff] >> PAGE_SHIFT;
 
-	return vmf_insert_pfn(vma, vmf->address, pfn);
+	ret = vmf_insert_pfn(vma, vmf->address, pfn);
+	if (ret & VM_FAULT_ERROR)
+		return ret;
+
+	/* Pre-fault the rest of the mapping while we are here. */
+	pgoff = vma->vm_pgoff;
+	addr = vma->vm_start;
+
+	for (; addr < vma->vm_end; pgoff++, addr += PAGE_SIZE) {
+		if (addr == vmf->address)
+			continue;
+
+		if (WARN_ON(pgoff >= ubuf->pagecount))
+			break;
+
+		pfn = folio_pfn(ubuf->folios[pgoff]);
+		pfn += ubuf->offsets[pgoff] >> PAGE_SHIFT;
+
+		/*
+		 * If vmf_insert_pfn() fails here, do not report an error
+		 * for this pre-fault step; the failure will be reported
+		 * when the address is actually accessed and faults again.
+		 */
+		if (vmf_insert_pfn(vma, addr, pfn) & VM_FAULT_ERROR)
+			break;
+	}
+
+	return ret;
 }
 
 static const struct vm_operations_struct udmabuf_vm_ops = {
@@ -161,17 +196,43 @@ static void unmap_udmabuf(struct dma_buf_attachment *at,
 	return put_sg_table(at->dev, sg, direction);
 }
 
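+/* Unpin every folio tracked on @unpin_list and free the list entries. */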
+static void unpin_all_folios(struct list_head *unpin_list)
+{
+	struct udmabuf_folio *ubuf_folio;
+
+	while (!list_empty(unpin_list)) {
+		ubuf_folio = list_first_entry(unpin_list,
+					      struct udmabuf_folio, list);
+		unpin_folio(ubuf_folio->folio);
+
+		list_del(&ubuf_folio->list);
+		kfree(ubuf_folio);
+	}
+}
+
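+/* Track @folio on @unpin_list so its pin can be dropped later. */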
+static int add_to_unpin_list(struct list_head *unpin_list,
+			     struct folio *folio)
+{
+	struct udmabuf_folio *ubuf_folio;
+
+	ubuf_folio = kzalloc(sizeof(*ubuf_folio), GFP_KERNEL);
+	if (!ubuf_folio)
+		return -ENOMEM;
+
+	ubuf_folio->folio = folio;
+	list_add_tail(&ubuf_folio->list, unpin_list);
+	return 0;
+}
+
 static void release_udmabuf(struct dma_buf *buf)
 {
 	struct udmabuf *ubuf = buf->priv;
 	struct device *dev = ubuf->device->this_device;
-	pgoff_t pg;
 
 	if (ubuf->sg)
 		put_sg_table(dev, ubuf->sg, DMA_BIDIRECTIONAL);
 
-	for (pg = 0; pg < ubuf->pagecount; pg++)
-		folio_put(ubuf->folios[pg]);
+	unpin_all_folios(&ubuf->unpin_list);
 	kvfree(ubuf->offsets);
 	kvfree(ubuf->folios);
 	kfree(ubuf);
@@ -226,71 +287,10 @@ static const struct dma_buf_ops udmabuf_ops = {
 #define SEALS_WANTED (F_SEAL_SHRINK)
 #define SEALS_DENIED (F_SEAL_WRITE|F_SEAL_FUTURE_WRITE)
 
-static int handle_hugetlb_pages(struct udmabuf *ubuf, struct file *memfd,
-				pgoff_t offset, pgoff_t pgcnt,
-				pgoff_t *pgbuf)
-{
-	struct hstate *hpstate = hstate_file(memfd);
-	pgoff_t mapidx = offset >> huge_page_shift(hpstate);
-	pgoff_t subpgoff = (offset & ~huge_page_mask(hpstate)) >> PAGE_SHIFT;
-	pgoff_t maxsubpgs = huge_page_size(hpstate) >> PAGE_SHIFT;
-	struct folio *folio = NULL;
-	pgoff_t pgidx;
-
-	mapidx <<= huge_page_order(hpstate);
-	for (pgidx = 0; pgidx < pgcnt; pgidx++) {
-		if (!folio) {
-			folio = __filemap_get_folio(memfd->f_mapping,
-						    mapidx,
-						    FGP_ACCESSED, 0);
-			if (IS_ERR(folio))
-				return PTR_ERR(folio);
-		}
-
-		folio_get(folio);
-		ubuf->folios[*pgbuf] = folio;
-		ubuf->offsets[*pgbuf] = subpgoff << PAGE_SHIFT;
-		(*pgbuf)++;
-		if (++subpgoff == maxsubpgs) {
-			folio_put(folio);
-			folio = NULL;
-			subpgoff = 0;
-			mapidx += pages_per_huge_page(hpstate);
-		}
-	}
-
-	if (folio)
-		folio_put(folio);
-
-	return 0;
-}
-
-static int handle_shmem_pages(struct udmabuf *ubuf, struct file *memfd,
-			      pgoff_t offset, pgoff_t pgcnt,
-			      pgoff_t *pgbuf)
-{
-	pgoff_t pgidx, pgoff = offset >> PAGE_SHIFT;
-	struct folio *folio = NULL;
-
-	for (pgidx = 0; pgidx < pgcnt; pgidx++) {
-		folio = shmem_read_folio(memfd->f_mapping, pgoff + pgidx);
-		if (IS_ERR(folio))
-			return PTR_ERR(folio);
-
-		ubuf->folios[*pgbuf] = folio;
-		(*pgbuf)++;
-	}
-
-	return 0;
-}
-
 static int check_memfd_seals(struct file *memfd)
 {
 	int seals;
 
-	if (!memfd)
-		return -EBADFD;
-
 	if (!shmem_file(memfd) && !is_file_hugepages(memfd))
 		return -EBADFD;
 
@@ -319,69 +319,126 @@ static struct dma_buf *export_udmabuf(struct udmabuf *ubuf,
 	return dma_buf_export(&exp_info);
 }
 
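+/*
+ * Pin the folios backing [start, start + size) of @memfd with
+ * memfd_pin_folios() and record a folio pointer and offset for each
+ * PAGE_SIZE chunk in ubuf->folios[] / ubuf->offsets[].
+ */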
+static long udmabuf_pin_folios(struct udmabuf *ubuf, struct file *memfd,
+			       loff_t start, loff_t size)
+{
+	pgoff_t pgoff, pgcnt, upgcnt = ubuf->pagecount;
+	struct folio **folios = NULL;
+	u32 cur_folio, cur_pgcnt;
+	long nr_folios;
+	long ret = 0;
+	loff_t end;
+
+	pgcnt = size >> PAGE_SHIFT;
+	folios = kvmalloc_array(pgcnt, sizeof(*folios), GFP_KERNEL);
+	if (!folios)
+		return -ENOMEM;
+
+	end = start + (pgcnt << PAGE_SHIFT) - 1;
+	nr_folios = memfd_pin_folios(memfd, start, end, folios, pgcnt, &pgoff);
+	if (nr_folios <= 0) {
+		ret = nr_folios ? nr_folios : -EINVAL;
+		goto end;
+	}
+
+	cur_pgcnt = 0;
+	for (cur_folio = 0; cur_folio < nr_folios; ++cur_folio) {
+		pgoff_t subpgoff = pgoff;
+		size_t fsize = folio_size(folios[cur_folio]);
+
+		ret = add_to_unpin_list(&ubuf->unpin_list, folios[cur_folio]);
+		if (ret < 0)
+			goto end;
+
+		for (; subpgoff < fsize; subpgoff += PAGE_SIZE) {
+			ubuf->folios[upgcnt] = folios[cur_folio];
+			ubuf->offsets[upgcnt] = subpgoff;
+			++upgcnt;
+
+			if (++cur_pgcnt >= pgcnt)
+				goto end;
+		}
+
+		/*
+		 * Within a given range, only the first subpage of the first
+		 * folio has an offset, which is returned by
+		 * memfd_pin_folios(). The first subpages of the remaining
+		 * folios in the range start at offset 0.
+		 */
+		pgoff = 0;
+	}
+end:
+	ubuf->pagecount = upgcnt;
+	kvfree(folios);
+	return ret;
+}
+
 static long udmabuf_create(struct miscdevice *device,
 			   struct udmabuf_create_list *head,
 			   struct udmabuf_create_item *list)
 {
-	pgoff_t pgcnt, pgbuf = 0, pglimit;
-	struct file *memfd = NULL;
+	pgoff_t pgcnt = 0, pglimit;
 	struct udmabuf *ubuf;
 	struct dma_buf *dmabuf;
-	int ret = -EINVAL;
+	long ret = -EINVAL;
 	u32 i, flags;
 
 	ubuf = kzalloc(sizeof(*ubuf), GFP_KERNEL);
 	if (!ubuf)
 		return -ENOMEM;
 
+	INIT_LIST_HEAD(&ubuf->unpin_list);
 	pglimit = (size_limit_mb * 1024 * 1024) >> PAGE_SHIFT;
 	for (i = 0; i < head->count; i++) {
-		if (!IS_ALIGNED(list[i].offset, PAGE_SIZE))
+		if (!PAGE_ALIGNED(list[i].offset))
 			goto err;
-		if (!IS_ALIGNED(list[i].size, PAGE_SIZE))
+		if (!PAGE_ALIGNED(list[i].size))
 			goto err;
-		ubuf->pagecount += list[i].size >> PAGE_SHIFT;
-		if (ubuf->pagecount > pglimit)
+
+		pgcnt += list[i].size >> PAGE_SHIFT;
+		if (pgcnt > pglimit)
 			goto err;
 	}
 
-	if (!ubuf->pagecount)
+	if (!pgcnt)
 		goto err;
 
-	ubuf->folios = kvmalloc_array(ubuf->pagecount, sizeof(*ubuf->folios),
-				      GFP_KERNEL);
+	ubuf->folios = kvmalloc_array(pgcnt, sizeof(*ubuf->folios), GFP_KERNEL);
 	if (!ubuf->folios) {
 		ret = -ENOMEM;
 		goto err;
 	}
-	ubuf->offsets = kvcalloc(ubuf->pagecount, sizeof(*ubuf->offsets),
-				 GFP_KERNEL);
+
+	ubuf->offsets = kvcalloc(pgcnt, sizeof(*ubuf->offsets), GFP_KERNEL);
 	if (!ubuf->offsets) {
 		ret = -ENOMEM;
 		goto err;
 	}
 
-	pgbuf = 0;
 	for (i = 0; i < head->count; i++) {
-		memfd = fget(list[i].memfd);
-		ret = check_memfd_seals(memfd);
-		if (ret < 0)
-			goto err;
+		struct file *memfd = fget(list[i].memfd);
 
-		pgcnt = list[i].size >> PAGE_SHIFT;
-		if (is_file_hugepages(memfd))
-			ret = handle_hugetlb_pages(ubuf, memfd,
-						   list[i].offset,
-						   pgcnt, &pgbuf);
-		else
-			ret = handle_shmem_pages(ubuf, memfd,
-						 list[i].offset,
-						 pgcnt, &pgbuf);
-		if (ret < 0)
+		if (!memfd) {
+			ret = -EBADFD;
 			goto err;
+		}
 
+		/*
+		 * Take the inode lock to protect against concurrent
+		 * memfd_add_seals(), which takes this lock in write mode.
+		 */
+		inode_lock_shared(file_inode(memfd));
+		ret = check_memfd_seals(memfd);
+		if (ret)
+			goto out_unlock;
+
+		ret = udmabuf_pin_folios(ubuf, memfd, list[i].offset,
+					 list[i].size);
+out_unlock:
+		inode_unlock_shared(file_inode(memfd));
 		fput(memfd);
-		memfd = NULL;
+		if (ret)
+			goto err;
 	}
 
 	flags = head->flags & UDMABUF_FLAGS_CLOEXEC ? O_CLOEXEC : 0;
@@ -403,10 +460,7 @@ static long udmabuf_create(struct miscdevice *device,
 	return ret;
 
 err:
-	while (pgbuf > 0)
-		folio_put(ubuf->folios[--pgbuf]);
-	if (memfd)
-		fput(memfd);
+	unpin_all_folios(&ubuf->unpin_list);
 	kvfree(ubuf->offsets);
 	kvfree(ubuf->folios);
 	kfree(ubuf);
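
For reference, a minimal userspace sketch of the path this patch exercises,
modeled on the kernel's udmabuf selftest (error handling is omitted, and the
buffer name and size are arbitrary illustrative choices):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <unistd.h>
#include <linux/udmabuf.h>

int main(void)
{
	/* udmabuf_create() rejects offsets/sizes that are not page-aligned. */
	const size_t size = getpagesize() * 16;

	/*
	 * Back the buffer with a sealable memfd; check_memfd_seals()
	 * requires F_SEAL_SHRINK (SEALS_WANTED above).
	 */
	int memfd = memfd_create("udmabuf-test", MFD_ALLOW_SEALING);
	ftruncate(memfd, size);
	fcntl(memfd, F_ADD_SEALS, F_SEAL_SHRINK);

	/*
	 * Ask the driver to pin the memfd pages (now done via
	 * memfd_pin_folios()) and wrap them in a dma-buf fd.
	 */
	struct udmabuf_create create = {
		.memfd  = memfd,
		.offset = 0,
		.size   = size,
	};
	int devfd = open("/dev/udmabuf", O_RDWR);
	int buffd = ioctl(devfd, UDMABUF_CREATE, &create);

	printf("dma-buf fd: %d\n", buffd);
	return buffd < 0 ? 1 : 0;
}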