@@ -31,6 +31,8 @@ static DEFINE_PER_CPU(struct list_head, wakeup_vcpus_on_cpu);
  */
 static DEFINE_PER_CPU(raw_spinlock_t, wakeup_vcpus_on_cpu_lock);
 
+#define PI_LOCK_SCHED_OUT SINGLE_DEPTH_NESTING
+
 static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
 {
 	return &(to_vmx(vcpu)->pi_desc);
@@ -89,9 +91,20 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
 	 * current pCPU if the task was migrated.
 	 */
 	if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR) {
-		raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
+		raw_spinlock_t *spinlock = &per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu);
+
+		/*
+		 * In addition to taking the wakeup lock for the regular/IRQ
+		 * context, tell lockdep it is being taken for the "sched out"
+		 * context as well.  vCPU loads happen in task context, and
+		 * this is taking the lock of the *previous* CPU, i.e. can race
+		 * with both the scheduler and the wakeup handler.
+		 */
+		raw_spin_lock(spinlock);
+		spin_acquire(&spinlock->dep_map, PI_LOCK_SCHED_OUT, 0, _RET_IP_);
 		list_del(&vmx->pi_wakeup_list);
-		raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
+		spin_release(&spinlock->dep_map, _RET_IP_);
+		raw_spin_unlock(spinlock);
 	}
 
 	dest = cpu_physical_id(cpu);
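
The hunk above pairs the real raw_spin_lock() with a lockdep-only annotation: spin_acquire() records an additional acquisition of the same lock under the PI_LOCK_SCHED_OUT subclass, and spin_release() drops that annotation again before the real unlock. Below is a minimal, self-contained sketch of the same pattern on a stand-in lock; the names demo_lock and demo_del_entry are illustrative, not from the patch.

#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/lockdep.h>
#include <linux/spinlock.h>

static DEFINE_RAW_SPINLOCK(demo_lock);

static void demo_del_entry(struct list_head *entry)
{
	/* The single real acquisition, tracked by lockdep under subclass 0. */
	raw_spin_lock(&demo_lock);
	/*
	 * Lockdep-only: also record an acquisition under subclass 1
	 * (SINGLE_DEPTH_NESTING), so this path is classified separately
	 * from other subclass-0 users of the same lock class.
	 */
	spin_acquire(&demo_lock.dep_map, SINGLE_DEPTH_NESTING, 0, _RET_IP_);

	list_del(entry);

	spin_release(&demo_lock.dep_map, _RET_IP_);
	raw_spin_unlock(&demo_lock);
}
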
@@ -148,11 +161,23 @@ static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu)
 	struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	struct pi_desc old, new;
-	unsigned long flags;
 
-	local_irq_save(flags);
+	lockdep_assert_irqs_disabled();
 
-	raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
+	/*
+	 * Acquire the wakeup lock using the "sched out" context to work around
+	 * a lockdep false positive.  When this is called, schedule() holds
+	 * various per-CPU scheduler locks.  When the wakeup handler runs, it
+	 * holds this CPU's wakeup lock while calling try_to_wake_up(), which
+	 * can eventually take the aforementioned scheduler locks, which causes
+	 * lockdep to assume there is a deadlock.
+	 *
+	 * Deadlock can't actually occur because IRQs are disabled for the
+	 * entirety of the sched_out critical section, i.e. the wakeup handler
+	 * can't run while the scheduler locks are held.
+	 */
+	raw_spin_lock_nested(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu),
+			     PI_LOCK_SCHED_OUT);
 	list_add_tail(&vmx->pi_wakeup_list,
 		      &per_cpu(wakeup_vcpus_on_cpu, vcpu->cpu));
 	raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
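
For comparison, a minimal sketch (not part of the patch) of the raw_spin_lock_nested() form used above: the second argument is purely a lockdep subclass, so acquisitions on the sched-out path are classified separately from the subclass-0 acquisitions performed by the wakeup handler, and the two chains no longer look like an ABBA deadlock to lockdep. The names demo_wakeup_lock, demo_wakeup_list and demo_sched_out_add are stand-ins.

#include <linux/list.h>
#include <linux/lockdep.h>
#include <linux/spinlock.h>

static DEFINE_RAW_SPINLOCK(demo_wakeup_lock);
static LIST_HEAD(demo_wakeup_list);

static void demo_sched_out_add(struct list_head *entry)
{
	/* The sched-out path runs with IRQs disabled; assert that here. */
	lockdep_assert_irqs_disabled();

	/* Take the lock under subclass 1; the wakeup handler uses subclass 0. */
	raw_spin_lock_nested(&demo_wakeup_lock, SINGLE_DEPTH_NESTING);
	list_add_tail(entry, &demo_wakeup_list);
	raw_spin_unlock(&demo_wakeup_lock);
}
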
@@ -176,8 +201,6 @@ static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu)
 	 */
 	if (pi_test_on(&new))
 		__apic_send_IPI_self(POSTED_INTR_WAKEUP_VECTOR);
-
-	local_irq_restore(flags);
 }
 
 static bool vmx_needs_pi_wakeup(struct kvm_vcpu *vcpu)