@@ -853,11 +853,98 @@ static struct kvm_memory_slot *gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu
  * About rmap_head encoding:
  *
  * If the bit zero of rmap_head->val is clear, then it points to the only spte
- * in this rmap chain. Otherwise, (rmap_head->val & ~1) points to a struct
+ * in this rmap chain. Otherwise, (rmap_head->val & ~3) points to a struct
  * pte_list_desc containing more mappings.
  */
 #define KVM_RMAP_MANY	BIT(0)
 
+/*
+ * rmaps and PTE lists are mostly protected by mmu_lock (the shadow MMU always
+ * operates with mmu_lock held for write), but rmaps can be walked without
+ * holding mmu_lock so long as the caller can tolerate SPTEs in the rmap chain
+ * being zapped/dropped _while the rmap is locked_.
+ *
+ * Other than the KVM_RMAP_LOCKED flag, modifications to rmap entries must be
+ * done while holding mmu_lock for write.  This allows a task walking rmaps
+ * without holding mmu_lock to concurrently walk the same entries as a task
+ * that is holding mmu_lock but _not_ the rmap lock.  Neither task will modify
+ * the rmaps, thus the walks are stable.
+ *
+ * As alluded to above, SPTEs in rmaps are _not_ protected by KVM_RMAP_LOCKED,
+ * only the rmap chains themselves are protected.  E.g. holding an rmap's lock
+ * ensures all "struct pte_list_desc" fields are stable.
+ */
+#define KVM_RMAP_LOCKED	BIT(1)
+
+static unsigned long kvm_rmap_lock(struct kvm_rmap_head *rmap_head)
+{
+	unsigned long old_val, new_val;
+
+	lockdep_assert_preemption_disabled();
+
+	/*
+	 * Elide the lock if the rmap is empty, as lockless walkers (read-only
+	 * mode) don't need to (and can't) walk an empty rmap, nor can they add
+	 * entries to the rmap.  I.e. the only paths that process empty rmaps
+	 * do so while holding mmu_lock for write, and are mutually exclusive.
+	 */
+	old_val = atomic_long_read(&rmap_head->val);
+	if (!old_val)
+		return 0;
+
+	do {
+		/*
+		 * If the rmap is locked, wait for it to be unlocked before
+		 * trying to acquire the lock, e.g. to avoid bouncing the cache
+		 * line.
+		 */
+		while (old_val & KVM_RMAP_LOCKED) {
+			cpu_relax();
+			old_val = atomic_long_read(&rmap_head->val);
+		}
+
+		/*
+		 * Recheck for an empty rmap, it may have been purged by the
+		 * task that held the lock.
+		 */
+		if (!old_val)
+			return 0;
+
+		new_val = old_val | KVM_RMAP_LOCKED;
+		/*
+		 * Use try_cmpxchg_acquire() to prevent reads and writes to the
+		 * rmap from being reordered outside of the critical section
+		 * created by kvm_rmap_lock().
+		 *
+		 * Pairs with the atomic_long_set_release() in kvm_rmap_unlock().
+		 *
+		 * For the !old_val case, no ordering is needed, as there is no
+		 * rmap to walk.
+		 */
+	} while (!atomic_long_try_cmpxchg_acquire(&rmap_head->val, &old_val, new_val));
+
+	/* Return the old value, i.e. _without_ the LOCKED bit set. */
+	return old_val;
+}
+
+static void kvm_rmap_unlock(struct kvm_rmap_head *rmap_head,
+			    unsigned long new_val)
+{
+	WARN_ON_ONCE(new_val & KVM_RMAP_LOCKED);
+	/*
+	 * Ensure that all accesses to the rmap have completed before unlocking
+	 * the rmap.
+	 *
+	 * Pairs with the atomic_long_try_cmpxchg_acquire() in kvm_rmap_lock().
+	 */
+	atomic_long_set_release(&rmap_head->val, new_val);
+}
+
+static unsigned long kvm_rmap_get(struct kvm_rmap_head *rmap_head)
+{
+	return atomic_long_read(&rmap_head->val) & ~KVM_RMAP_LOCKED;
+}
+
 /*
  * Returns the number of pointers in the rmap chain, not counting the new one.
  */
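
The helpers above only compile if rmap_head->val is an atomic_long_t rather than a plain unsigned long; that type change is not visible in this hunk and is presumably made elsewhere in the series. A minimal sketch of the assumed definition and bit layout, for orientation only:

/*
 * Assumed definition (not shown in this diff): a single atomic word so that
 * readers outside mmu_lock get tear-free loads.
 */
struct kvm_rmap_head {
	atomic_long_t val;
};

/*
 * Assumed bit layout of rmap_head->val, per the comments above:
 *   bit 0 (KVM_RMAP_MANY)   clear: val is the lone sptep (a u64 *)
 *                           set:   val & ~3 points to a struct pte_list_desc
 *   bit 1 (KVM_RMAP_LOCKED) the per-rmap lock bit taken by kvm_rmap_lock()
 *                           and cleared by kvm_rmap_unlock()
 */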
@@ -868,7 +955,7 @@ static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte,
 	struct pte_list_desc *desc;
 	int count = 0;
 
-	old_val = rmap_head->val;
+	old_val = kvm_rmap_lock(rmap_head);
 
 	if (!old_val) {
 		new_val = (unsigned long)spte;
@@ -900,7 +987,7 @@ static int pte_list_add(struct kvm_mmu_memory_cache *cache, u64 *spte,
 		desc->sptes[desc->spte_count++] = spte;
 	}
 
-	rmap_head->val = new_val;
+	kvm_rmap_unlock(rmap_head, new_val);
 
 	return count;
 }
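
The two hunks above show the writer side: pte_list_add() performs the whole insertion between kvm_rmap_lock() and kvm_rmap_unlock(), with mmu_lock also held for write. A reader that walks a chain without mmu_lock would pair the same primitives; the sketch below is illustrative only (the function name and the walk are not part of this patch), and its caller must have preemption disabled to satisfy the lockdep assertion in kvm_rmap_lock():

/*
 * Illustrative lockless walk.  Holding the rmap lock stabilizes the
 * pte_list_desc chain itself; the SPTEs it points at may still be zapped by
 * a task holding mmu_lock for write.
 */
static unsigned int example_count_rmap_sptes(struct kvm_rmap_head *rmap_head)
{
	struct pte_list_desc *desc;
	unsigned long rmap_val;
	unsigned int count = 0;

	/* Returns 0, and takes no lock, if the rmap is empty. */
	rmap_val = kvm_rmap_lock(rmap_head);
	if (!rmap_val)
		return 0;

	if (!(rmap_val & KVM_RMAP_MANY)) {
		/* Single-entry encoding: the value is the sptep itself. */
		count = 1;
	} else {
		desc = (struct pte_list_desc *)(rmap_val & ~KVM_RMAP_MANY);
		for ( ; desc; desc = desc->more)
			count += desc->spte_count;
	}

	/* Write back the unmodified value; this also clears KVM_RMAP_LOCKED. */
	kvm_rmap_unlock(rmap_head, rmap_val);
	return count;
}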
@@ -948,7 +1035,7 @@ static void pte_list_remove(struct kvm *kvm, u64 *spte,
 	unsigned long rmap_val;
 	int i;
 
-	rmap_val = rmap_head->val;
+	rmap_val = kvm_rmap_lock(rmap_head);
 	if (KVM_BUG_ON_DATA_CORRUPTION(!rmap_val, kvm))
 		goto out;
 
@@ -974,7 +1061,7 @@ static void pte_list_remove(struct kvm *kvm, u64 *spte,
 	}
 
 out:
-	rmap_head->val = rmap_val;
+	kvm_rmap_unlock(rmap_head, rmap_val);
 }
 
 static void kvm_zap_one_rmap_spte(struct kvm *kvm,
@@ -992,7 +1079,7 @@ static bool kvm_zap_all_rmap_sptes(struct kvm *kvm,
 	unsigned long rmap_val;
 	int i;
 
-	rmap_val = rmap_head->val;
+	rmap_val = kvm_rmap_lock(rmap_head);
 	if (!rmap_val)
 		return false;
 
@@ -1011,13 +1098,13 @@ static bool kvm_zap_all_rmap_sptes(struct kvm *kvm,
 	}
 out:
 	/* rmap_head is meaningless now, remember to reset it */
-	rmap_head->val = 0;
+	kvm_rmap_unlock(rmap_head, 0);
 	return true;
 }
 
 unsigned int pte_list_count(struct kvm_rmap_head *rmap_head)
 {
-	unsigned long rmap_val = rmap_head->val;
+	unsigned long rmap_val = kvm_rmap_get(rmap_head);
 	struct pte_list_desc *desc;
 
 	if (!rmap_val)
@@ -1083,7 +1170,7 @@ struct rmap_iterator {
 static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head,
 			   struct rmap_iterator *iter)
 {
-	unsigned long rmap_val = rmap_head->val;
+	unsigned long rmap_val = kvm_rmap_get(rmap_head);
 	u64 *sptep;
 
 	if (!rmap_val)
@@ -1418,7 +1505,7 @@ static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
 	while (++iterator->rmap <= iterator->end_rmap) {
 		iterator->gfn += KVM_PAGES_PER_HPAGE(iterator->level);
 
-		if (iterator->rmap->val)
+		if (atomic_long_read(&iterator->rmap->val))
 			return;
 	}
 
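
With val now an atomic word, call sites that merely peek at an rmap can no longer read the field directly. slot_rmap_walk_next() above, and the parent_ptes check in the next hunk, only need an unordered "is it non-empty?" snapshot while holding mmu_lock, so they use atomic_long_read() rather than taking the rmap lock. If that pattern grew more common it could be wrapped in a trivial helper; a hypothetical sketch, not part of this patch:

/*
 * Hypothetical helper: an unlocked emptiness check for callers that only care
 * whether the rmap has any entries.  An empty rmap is never locked (the lock
 * is elided by kvm_rmap_lock()), so masking off KVM_RMAP_LOCKED is purely
 * defensive.
 */
static bool kvm_rmap_empty(struct kvm_rmap_head *rmap_head)
{
	return !(atomic_long_read(&rmap_head->val) & ~KVM_RMAP_LOCKED);
}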
@@ -2444,7 +2531,8 @@ static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
 			 * avoids retaining a large number of stale nested SPs.
 			 */
 			if (tdp_enabled && invalid_list &&
-			    child->role.guest_mode && !child->parent_ptes.val)
+			    child->role.guest_mode &&
+			    !atomic_long_read(&child->parent_ptes.val))
 				return kvm_mmu_prepare_zap_page(kvm, child,
 								invalid_list);
 		}