@@ -27,8 +27,10 @@ use borrow::{to_uint};
 use cell::Cell;
 use rand::{XorShiftRng, Rng, Rand};
 use iter::range;
+use unstable::mutex::Mutex;
 use vec::{OwnedVector};
 
+
 /// A scheduler is responsible for coordinating the execution of Tasks
 /// on a single thread. The scheduler runs inside a slightly modified
 /// Rust Task. When not running this task is stored in the scheduler
@@ -618,6 +620,12 @@ impl Scheduler {
         unsafe {
             let task: *mut Task = Local::unsafe_borrow();
             (*task).sched.get_mut_ref().run_cleanup_job();
+
+            // See the comments in switch_running_tasks_and_then for why a lock
+            // is acquired here. This is the resumption point and the "bounce"
+            // that those comments refer to.
+            (*task).nasty_deschedule_lock.lock();
+            (*task).nasty_deschedule_lock.unlock();
         }
     }
 
@@ -671,6 +679,15 @@ impl Scheduler {
     /// This passes a Scheduler pointer to the fn after the context switch
     /// in order to prevent that fn from performing further scheduling operations.
     /// Doing further scheduling could easily result in infinite recursion.
+    ///
+    /// Note that if the closure provided relinquishes ownership of the
+    /// BlockedTask, then it is possible for the task to resume execution before
+    /// the closure has finished executing. This would naturally introduce a
+    /// race if the closure and the task shared portions of the environment.
+    ///
+    /// This race is currently prevented; in other words, it is guaranteed
+    /// that this function will not return before the given closure has
+    /// returned.
     pub fn deschedule_running_task_and_then(mut ~self,
                                             f: |&mut Scheduler, BlockedTask|) {
         // Trickier - we need to get the scheduler task out of self
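
The guarantee stated in the new doc comment is easiest to see by modeling the race it rules out. The sketch below is illustrative only: it uses modern `std::sync` atomics and OS threads in place of the runtime's sched task and `BlockedTask`, and the names (`ENV`, the wake channel) are hypothetical, not part of this patch.

```rust
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::mpsc::channel;
use std::thread;

// Stand-in for the environment shared between `f` and the descheduled task.
static ENV: AtomicUsize = AtomicUsize::new(0);

fn main() {
    let (wake_tx, wake_rx) = channel::<()>();

    // Stand-in for `f` running on the sched task.
    let sched = thread::spawn(move || {
        ENV.store(1, Ordering::SeqCst); // `f` starts using the environment
        wake_tx.send(()).unwrap();      // relinquishes the BlockedTask early
        ENV.store(2, Ordering::SeqCst); // `f` is *still* using the environment
    });

    // Stand-in for the descheduled task resuming.
    wake_rx.recv().unwrap();
    // Without the lock "bounce" this read races with `f`'s second store:
    // it may observe 1 or 2 depending on thread scheduling.
    println!("resumed task sees env = {}", ENV.load(Ordering::SeqCst));

    sched.join().unwrap();
}
```
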
@@ -682,10 +699,29 @@ impl Scheduler {
 
     pub fn switch_running_tasks_and_then(~self, next_task: ~Task,
                                          f: |&mut Scheduler, BlockedTask|) {
-        // This is where we convert the BlockedTask-taking closure into one
-        // that takes just a Task
-        self.change_task_context(next_task, |sched, task| {
-            f(sched, BlockedTask::block(task))
+        // And here comes one of the sad moments in which a lock is used in a
+        // core portion of the rust runtime. As always, this is highly
+        // undesirable, so there's a good reason behind it.
+        //
+        // There is an excellent outline of the problem in issue #8132, and the
+        // summary is that `f` is executed on a sched task, but its environment
+        // lives on the previous task. If `f` relinquishes ownership of the
+        // BlockedTask, it may introduce a race in which both `f` and the code
+        // after the 'deschedule' block are using the environment.
+        //
+        // The solution we have chosen to adopt for now is to acquire a
+        // task-local lock around this block. The resumption of the task in
+        // context switching will bounce on the lock, thereby waiting for this
+        // block to finish, eliminating the race mentioned above.
+        //
+        // To actually maintain a handle to the lock, we use an unsafe pointer
+        // to it, but we're guaranteed that the task won't exit until we've
+        // unlocked the lock, so there's no worry of this memory going away.
+        self.change_task_context(next_task, |sched, mut task| {
+            let lock: *mut Mutex = &mut task.nasty_deschedule_lock;
+            unsafe { (*lock).lock() }
+            f(sched, BlockedTask::block(task));
+            unsafe { (*lock).unlock() }
         })
     }
 
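
For reference, here is a minimal model of the lock "bounce" described above, under the assumption that the per-task `nasty_deschedule_lock` behaves like an ordinary mutex. `std::sync::Mutex` and a channel stand in for the runtime's `unstable::mutex::Mutex` and the scheduler's wakeup path; all names are illustrative, not the runtime's API.

```rust
use std::sync::mpsc::channel;
use std::sync::{Arc, Mutex};
use std::thread;

fn main() {
    // Stand-in for the per-task `nasty_deschedule_lock`.
    let lock = Arc::new(Mutex::new(()));
    let (wake_tx, wake_rx) = channel::<()>();

    // Scheduler side: take the task-local lock *before* running `f`.
    let guard = lock.lock().unwrap();

    let task = {
        let lock = lock.clone();
        thread::spawn(move || {
            wake_rx.recv().unwrap(); // woken early by `f`
            // Resumption point: "bounce" on the lock. This blocks until the
            // scheduler-side closure has unlocked, then releases immediately.
            drop(lock.lock().unwrap());
            // Only past this point may the task touch the shared environment.
        })
    };

    wake_tx.send(()).unwrap(); // `f` relinquishes the BlockedTask early...
    // ...but keeps executing here, still holding the lock.
    drop(guard); // `f` has finished: unlock, letting the task proceed.

    task.join().unwrap();
}
```

The key property is that the resumed task cannot run past its resumption point until the scheduler-side closure releases the lock, which is exactly what the lock/unlock pair added after `run_cleanup_job` provides.
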