@@ -1610,100 +1610,20 @@ ssize_t vfio_pci_core_write(struct vfio_device *core_vdev, const char __user *bu
 }
 EXPORT_SYMBOL_GPL(vfio_pci_core_write);
 
-/* Return 1 on zap and vma_lock acquired, 0 on contention (only with @try) */
-static int vfio_pci_zap_and_vma_lock(struct vfio_pci_core_device *vdev, bool try)
+static void vfio_pci_zap_bars(struct vfio_pci_core_device *vdev)
 {
-        struct vfio_pci_mmap_vma *mmap_vma, *tmp;
+        struct vfio_device *core_vdev = &vdev->vdev;
+        loff_t start = VFIO_PCI_INDEX_TO_OFFSET(VFIO_PCI_BAR0_REGION_INDEX);
+        loff_t end = VFIO_PCI_INDEX_TO_OFFSET(VFIO_PCI_ROM_REGION_INDEX);
+        loff_t len = end - start;
 
-        /*
-         * Lock ordering:
-         * vma_lock is nested under mmap_lock for vm_ops callback paths.
-         * The memory_lock semaphore is used by both code paths calling
-         * into this function to zap vmas and the vm_ops.fault callback
-         * to protect the memory enable state of the device.
-         *
-         * When zapping vmas we need to maintain the mmap_lock => vma_lock
-         * ordering, which requires using vma_lock to walk vma_list to
-         * acquire an mm, then dropping vma_lock to get the mmap_lock and
-         * reacquiring vma_lock.  This logic is derived from similar
-         * requirements in uverbs_user_mmap_disassociate().
-         *
-         * mmap_lock must always be the top-level lock when it is taken.
-         * Therefore we can only hold the memory_lock write lock when
-         * vma_list is empty, as we'd need to take mmap_lock to clear
-         * entries.  vma_list can only be guaranteed empty when holding
-         * vma_lock, thus memory_lock is nested under vma_lock.
-         *
-         * This enables the vm_ops.fault callback to acquire vma_lock,
-         * followed by memory_lock read lock, while already holding
-         * mmap_lock without risk of deadlock.
-         */
-        while (1) {
-                struct mm_struct *mm = NULL;
-
-                if (try) {
-                        if (!mutex_trylock(&vdev->vma_lock))
-                                return 0;
-                } else {
-                        mutex_lock(&vdev->vma_lock);
-                }
-                while (!list_empty(&vdev->vma_list)) {
-                        mmap_vma = list_first_entry(&vdev->vma_list,
-                                                    struct vfio_pci_mmap_vma,
-                                                    vma_next);
-                        mm = mmap_vma->vma->vm_mm;
-                        if (mmget_not_zero(mm))
-                                break;
-
-                        list_del(&mmap_vma->vma_next);
-                        kfree(mmap_vma);
-                        mm = NULL;
-                }
-                if (!mm)
-                        return 1;
-                mutex_unlock(&vdev->vma_lock);
-
-                if (try) {
-                        if (!mmap_read_trylock(mm)) {
-                                mmput(mm);
-                                return 0;
-                        }
-                } else {
-                        mmap_read_lock(mm);
-                }
-                if (try) {
-                        if (!mutex_trylock(&vdev->vma_lock)) {
-                                mmap_read_unlock(mm);
-                                mmput(mm);
-                                return 0;
-                        }
-                } else {
-                        mutex_lock(&vdev->vma_lock);
-                }
-                list_for_each_entry_safe(mmap_vma, tmp,
-                                         &vdev->vma_list, vma_next) {
-                        struct vm_area_struct *vma = mmap_vma->vma;
-
-                        if (vma->vm_mm != mm)
-                                continue;
-
-                        list_del(&mmap_vma->vma_next);
-                        kfree(mmap_vma);
-
-                        zap_vma_ptes(vma, vma->vm_start,
-                                     vma->vm_end - vma->vm_start);
-                }
-                mutex_unlock(&vdev->vma_lock);
-                mmap_read_unlock(mm);
-                mmput(mm);
-        }
+        unmap_mapping_range(core_vdev->inode->i_mapping, start, len, true);
 }
 
 void vfio_pci_zap_and_down_write_memory_lock(struct vfio_pci_core_device *vdev)
 {
-        vfio_pci_zap_and_vma_lock(vdev, false);
         down_write(&vdev->memory_lock);
-        mutex_unlock(&vdev->vma_lock);
+        vfio_pci_zap_bars(vdev);
 }
 
 u16 vfio_pci_memory_lock_and_enable(struct vfio_pci_core_device *vdev)
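
The new zap helper relies on vfio-pci's fixed mmap offset encoding: VFIO_PCI_INDEX_TO_OFFSET() shifts the region index into the high bits of the file offset, so BAR0 through BAR5 occupy one contiguous offset range that ends where the ROM region begins, and unmap_mapping_range() can drop every BAR mapping purely by file offset. Below is a small stand-alone sketch of that arithmetic; the shift value of 40 mirrors VFIO_PCI_OFFSET_SHIFT in vfio_pci_core.h but is an assumption of the sketch, not something taken from this diff.

#include <stdint.h>
#include <stdio.h>

/* Mirrors the kernel's encoding: offset = region_index << VFIO_PCI_OFFSET_SHIFT.
 * Constants are copied here for illustration only; the kernel headers are
 * authoritative. */
#define SKETCH_OFFSET_SHIFT     40
#define SKETCH_BAR0_INDEX       0
#define SKETCH_ROM_INDEX        6       /* first region index past BAR5 */

static uint64_t index_to_offset(unsigned int index)
{
        return (uint64_t)index << SKETCH_OFFSET_SHIFT;
}

int main(void)
{
        uint64_t start = index_to_offset(SKETCH_BAR0_INDEX);
        uint64_t end = index_to_offset(SKETCH_ROM_INDEX);

        /* unmap_mapping_range(mapping, start, len, true) tears down every
         * user mapping whose file offset lies in [start, start + len). */
        printf("zap range: [%#llx, %#llx), len %#llx\n",
               (unsigned long long)start, (unsigned long long)end,
               (unsigned long long)(end - start));
        return 0;
}
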
@@ -1725,99 +1645,41 @@ void vfio_pci_memory_unlock_and_restore(struct vfio_pci_core_device *vdev, u16 c
         up_write(&vdev->memory_lock);
 }
 
-/* Caller holds vma_lock */
-static int __vfio_pci_add_vma(struct vfio_pci_core_device *vdev,
-                              struct vm_area_struct *vma)
-{
-        struct vfio_pci_mmap_vma *mmap_vma;
-
-        mmap_vma = kmalloc(sizeof(*mmap_vma), GFP_KERNEL_ACCOUNT);
-        if (!mmap_vma)
-                return -ENOMEM;
-
-        mmap_vma->vma = vma;
-        list_add(&mmap_vma->vma_next, &vdev->vma_list);
-
-        return 0;
-}
-
-/*
- * Zap mmaps on open so that we can fault them in on access and therefore
- * our vma_list only tracks mappings accessed since last zap.
- */
-static void vfio_pci_mmap_open(struct vm_area_struct *vma)
-{
-        zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);
-}
-
-static void vfio_pci_mmap_close(struct vm_area_struct *vma)
+static unsigned long vma_to_pfn(struct vm_area_struct *vma)
 {
         struct vfio_pci_core_device *vdev = vma->vm_private_data;
-        struct vfio_pci_mmap_vma *mmap_vma;
+        int index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
+        u64 pgoff;
 
-        mutex_lock(&vdev->vma_lock);
-        list_for_each_entry(mmap_vma, &vdev->vma_list, vma_next) {
-                if (mmap_vma->vma == vma) {
-                        list_del(&mmap_vma->vma_next);
-                        kfree(mmap_vma);
-                        break;
-                }
-        }
-        mutex_unlock(&vdev->vma_lock);
+        pgoff = vma->vm_pgoff &
+                ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
+
+        return (pci_resource_start(vdev->pdev, index) >> PAGE_SHIFT) + pgoff;
 }
 
 static vm_fault_t vfio_pci_mmap_fault(struct vm_fault *vmf)
 {
         struct vm_area_struct *vma = vmf->vma;
         struct vfio_pci_core_device *vdev = vma->vm_private_data;
-        struct vfio_pci_mmap_vma *mmap_vma;
-        vm_fault_t ret = VM_FAULT_NOPAGE;
+        unsigned long pfn, pgoff = vmf->pgoff - vma->vm_pgoff;
+        vm_fault_t ret = VM_FAULT_SIGBUS;
 
-        mutex_lock(&vdev->vma_lock);
-        down_read(&vdev->memory_lock);
+        pfn = vma_to_pfn(vma);
 
-        /*
-         * Memory region cannot be accessed if the low power feature is engaged
-         * or memory access is disabled.
-         */
-        if (vdev->pm_runtime_engaged || !__vfio_pci_memory_enabled(vdev)) {
-                ret = VM_FAULT_SIGBUS;
-                goto up_out;
-        }
+        down_read(&vdev->memory_lock);
 
-        /*
-         * We populate the whole vma on fault, so we need to test whether
-         * the vma has already been mapped, such as for concurrent faults
-         * to the same vma.  io_remap_pfn_range() will trigger a BUG_ON if
-         * we ask it to fill the same range again.
-         */
-        list_for_each_entry(mmap_vma, &vdev->vma_list, vma_next) {
-                if (mmap_vma->vma == vma)
-                        goto up_out;
-        }
+        if (vdev->pm_runtime_engaged || !__vfio_pci_memory_enabled(vdev))
+                goto out_disabled;
 
-        if (io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
-                               vma->vm_end - vma->vm_start,
-                               vma->vm_page_prot)) {
-                ret = VM_FAULT_SIGBUS;
-                zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);
-                goto up_out;
-        }
+        ret = vmf_insert_pfn(vma, vmf->address, pfn + pgoff);
 
-        if (__vfio_pci_add_vma(vdev, vma)) {
-                ret = VM_FAULT_OOM;
-                zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);
-        }
-
-up_out:
+out_disabled:
         up_read(&vdev->memory_lock);
-        mutex_unlock(&vdev->vma_lock);
+
         return ret;
 }
 
 static const struct vm_operations_struct vfio_pci_mmap_ops = {
-        .open = vfio_pci_mmap_open,
-        .close = vfio_pci_mmap_close,
         .fault = vfio_pci_mmap_fault,
 };
 
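
With the .open/.close callbacks and the vma_list gone, each fault now installs a single PTE with vmf_insert_pfn() rather than populating the whole VMA through io_remap_pfn_range(), and the pfn is recovered from the VMA's file offset at fault time. A minimal sketch of that decode follows; the values for VFIO_PCI_OFFSET_SHIFT (40) and PAGE_SHIFT (12) are assumptions of the sketch, not taken from this patch.

#include <stdint.h>
#include <stdio.h>

/* Region-index bits within a page offset, assuming a 40-bit offset shift
 * and 4 KiB pages. */
#define SKETCH_SHIFT        (40 - 12)
#define SKETCH_PGOFF_MASK   ((1ULL << SKETCH_SHIFT) - 1)

int main(void)
{
        /* A made-up faulting page offset: page 3 of region index 2 (BAR2). */
        uint64_t fault_pgoff = ((uint64_t)2 << SKETCH_SHIFT) | 3;

        unsigned int index = fault_pgoff >> SKETCH_SHIFT;
        uint64_t page_in_bar = fault_pgoff & SKETCH_PGOFF_MASK;

        /* The real handler adds page_in_bar to the BAR's starting pfn and
         * installs exactly that one page with vmf_insert_pfn(). */
        printf("region index %u, page %llu within the BAR\n",
               index, (unsigned long long)page_in_bar);
        return 0;
}
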
@@ -1880,11 +1742,12 @@ int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma
 
         vma->vm_private_data = vdev;
         vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
-        vma->vm_pgoff = (pci_resource_start(pdev, index) >> PAGE_SHIFT) + pgoff;
+        vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot);
 
         /*
-         * See remap_pfn_range(), called from vfio_pci_fault() but we can't
-         * change vm_flags within the fault handler.  Set them now.
+         * Set vm_flags now, they should not be changed in the fault handler.
+         * We want the same flags and page protection (decrypted above) as
+         * io_remap_pfn_range() would set.
          *
          * VM_ALLOW_ANY_UNCACHED: The VMA flag is implemented for ARM64,
          * allowing KVM stage 2 device mapping attributes to use Normal-NC
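
Dropping the vm_pgoff rewrite above is what makes the file-offset-based zap work: the VMA keeps the offset the user passed to mmap() (region index encoded in the high bits), so unmap_mapping_range() over the BAR offset range can find it, and vma_to_pfn() translates it back to a physical pfn on demand. A hypothetical userspace caller might look like the sketch below; device_fd, the BAR index, and the mapping size are placeholders, and real code would take the region offset and size from VFIO_DEVICE_GET_REGION_INFO rather than hard-coding the 40-bit shift.

#define _FILE_OFFSET_BITS 64
#include <stdio.h>
#include <sys/mman.h>

/* Map a BAR of an already-opened VFIO device fd (illustrative only). */
void *map_bar(int device_fd, unsigned int bar_index, size_t size)
{
        off_t offset = (off_t)bar_index << 40;  /* assumed offset encoding */
        void *map;

        map = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED,
                   device_fd, offset);
        if (map == MAP_FAILED) {
                perror("mmap");
                return NULL;
        }
        return map;
}
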
@@ -2202,8 +2065,6 @@ int vfio_pci_core_init_dev(struct vfio_device *core_vdev)
         mutex_init(&vdev->ioeventfds_lock);
         INIT_LIST_HEAD(&vdev->dummy_resources_list);
         INIT_LIST_HEAD(&vdev->ioeventfds_list);
-        mutex_init(&vdev->vma_lock);
-        INIT_LIST_HEAD(&vdev->vma_list);
         INIT_LIST_HEAD(&vdev->sriov_pfs_item);
         init_rwsem(&vdev->memory_lock);
         xa_init(&vdev->ctx);
@@ -2219,7 +2080,6 @@ void vfio_pci_core_release_dev(struct vfio_device *core_vdev)
 
         mutex_destroy(&vdev->igate);
         mutex_destroy(&vdev->ioeventfds_lock);
-        mutex_destroy(&vdev->vma_lock);
         kfree(vdev->region);
         kfree(vdev->pm_save);
 }
@@ -2497,26 +2357,15 @@ static int vfio_pci_dev_set_pm_runtime_get(struct vfio_device_set *dev_set)
         return ret;
 }
 
-/*
- * We need to get memory_lock for each device, but devices can share mmap_lock,
- * therefore we need to zap and hold the vma_lock for each device, and only then
- * get each memory_lock.
- */
 static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
                                       struct vfio_pci_group_info *groups,
                                       struct iommufd_ctx *iommufd_ctx)
 {
-        struct vfio_pci_core_device *cur_mem;
-        struct vfio_pci_core_device *cur_vma;
-        struct vfio_pci_core_device *cur;
+        struct vfio_pci_core_device *vdev;
         struct pci_dev *pdev;
-        bool is_mem = true;
         int ret;
 
         mutex_lock(&dev_set->lock);
-        cur_mem = list_first_entry(&dev_set->device_list,
-                                   struct vfio_pci_core_device,
-                                   vdev.dev_set_list);
 
         pdev = vfio_pci_dev_set_resettable(dev_set);
         if (!pdev) {
@@ -2533,7 +2382,7 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
         if (ret)
                 goto err_unlock;
 
-        list_for_each_entry(cur_vma, &dev_set->device_list, vdev.dev_set_list) {
+        list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list) {
                 bool owned;
 
                 /*
@@ -2557,38 +2406,38 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
                  * Otherwise, reset is not allowed.
                  */
                 if (iommufd_ctx) {
-                        int devid = vfio_iommufd_get_dev_id(&cur_vma->vdev,
+                        int devid = vfio_iommufd_get_dev_id(&vdev->vdev,
                                                             iommufd_ctx);
 
                         owned = (devid > 0 || devid == -ENOENT);
                 } else {
-                        owned = vfio_dev_in_groups(&cur_vma->vdev, groups);
+                        owned = vfio_dev_in_groups(&vdev->vdev, groups);
                 }
 
                 if (!owned) {
                         ret = -EINVAL;
-                        goto err_undo;
+                        break;
                 }
 
                 /*
-                 * Locking multiple devices is prone to deadlock, runaway and
-                 * unwind if we hit contention.
+                 * Take the memory write lock for each device and zap BAR
+                 * mappings to prevent the user accessing the device while in
+                 * reset.  Locking multiple devices is prone to deadlock,
+                 * runaway and unwind if we hit contention.
                  */
-                if (!vfio_pci_zap_and_vma_lock(cur_vma, true)) {
+                if (!down_write_trylock(&vdev->memory_lock)) {
                         ret = -EBUSY;
-                        goto err_undo;
+                        break;
                 }
+
+                vfio_pci_zap_bars(vdev);
         }
-        cur_vma = NULL;
 
-        list_for_each_entry(cur_mem, &dev_set->device_list, vdev.dev_set_list) {
-                if (!down_write_trylock(&cur_mem->memory_lock)) {
-                        ret = -EBUSY;
-                        goto err_undo;
-                }
-                mutex_unlock(&cur_mem->vma_lock);
+        if (!list_entry_is_head(vdev,
+                                &dev_set->device_list, vdev.dev_set_list)) {
+                vdev = list_prev_entry(vdev, vdev.dev_set_list);
+                goto err_undo;
         }
-        cur_mem = NULL;
 
         /*
          * The pci_reset_bus() will reset all the devices in the bus.
@@ -2599,25 +2448,22 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
          * cause the PCI config space reset without restoring the original
          * state (saved locally in 'vdev->pm_save').
          */
-        list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list)
-                vfio_pci_set_power_state(cur, PCI_D0);
+        list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list)
+                vfio_pci_set_power_state(vdev, PCI_D0);
 
         ret = pci_reset_bus(pdev);
 
+        vdev = list_last_entry(&dev_set->device_list,
+                               struct vfio_pci_core_device, vdev.dev_set_list);
+
 err_undo:
-        list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) {
-                if (cur == cur_mem)
-                        is_mem = false;
-                if (cur == cur_vma)
-                        break;
-                if (is_mem)
-                        up_write(&cur->memory_lock);
-                else
-                        mutex_unlock(&cur->vma_lock);
-        }
+        list_for_each_entry_from_reverse(vdev, &dev_set->device_list,
+                                         vdev.dev_set_list)
+                up_write(&vdev->memory_lock);
+
+        list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list)
+                pm_runtime_put(&vdev->pdev->dev);
 
-        list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list)
-                pm_runtime_put(&cur->pdev->dev);
 err_unlock:
         mutex_unlock(&dev_set->lock);
         return ret;
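
The err_undo path now walks the device list backwards from the last device whose memory_lock was taken, releasing only those locks. A stand-alone analogue of that try-lock-all-or-unwind pattern is sketched below, using an array of pthread mutexes purely for illustration; it is not derived from the kernel code beyond the control flow.

#include <pthread.h>
#include <stdio.h>

#define NLOCKS 4

static pthread_mutex_t locks[NLOCKS] = {
        PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
        PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
};

/* Try-lock every lock in order; on contention release only the ones already
 * taken, walking backwards from the last acquired entry. */
static int lock_all_or_unwind(void)
{
        int i, last_taken = -1;

        for (i = 0; i < NLOCKS; i++) {
                if (pthread_mutex_trylock(&locks[i]) != 0)
                        break;                  /* contention: unwind below */
                last_taken = i;
        }

        if (last_taken == NLOCKS - 1)
                return 0;                       /* all locked, caller proceeds */

        for (i = last_taken; i >= 0; i--)       /* unwind from the last taken */
                pthread_mutex_unlock(&locks[i]);
        return -1;
}

int main(void)
{
        printf("lock_all_or_unwind() -> %d\n", lock_all_or_unwind());
        return 0;
}
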