Fix deadlock with AVTTSEngine

mickael-menu · mickael-menu · commit acc3eae98c4f · 2022-05-19T15:26:16.000+02:00
diff --git a/Sources/Navigator/TTS/AVTTSEngine.swift b/Sources/Navigator/TTS/AVTTSEngine.swift
@@ -8,6 +8,7 @@ import AVFoundation
 import Foundation
 import R2Shared
 
+/// Implementation of a `TTSEngine` using Apple AVFoundation's `AVSpeechSynthesizer`.
 public class AVTTSEngine: NSObject, TTSEngine, AVSpeechSynthesizerDelegate, Loggable {
 
     /// Range of valid values for an AVUtterance rate.
@@ -28,12 +29,17 @@ public class AVTTSEngine: NSObject, TTSEngine, AVSpeechSynthesizerDelegate, Logg
 
     public let defaultConfig: TTSConfiguration
     public var config: TTSConfiguration
+    private let debug: Bool
 
     public weak var delegate: TTSEngineDelegate?
 
     private let synthesizer = AVSpeechSynthesizer()
 
-    public override init() {
+    /// Creates a new `AVTTSEngine` instance.
+    ///
+    /// - Parameters:
+    ///   - debug: Print the state machine transitions.
+    public init(debug: Bool = false) {
         let config = TTSConfiguration(
             defaultLanguage: Language(code: .bcp47(AVSpeechSynthesisVoice.currentLanguageCode())),
             rate: avRateRange.percentageForValue(Double(AVSpeechUtteranceDefaultSpeechRate)),
@@ -42,6 +48,7 @@ public class AVTTSEngine: NSObject, TTSEngine, AVSpeechSynthesizerDelegate, Logg
 
         self.defaultConfig = config
         self.config = config
+        self.debug = debug
 
         super.init()
         synthesizer.delegate = self
@@ -57,43 +64,32 @@ public class AVTTSEngine: NSObject, TTSEngine, AVSpeechSynthesizerDelegate, Logg
     }
 
     public func speak(_ utterance: TTSUtterance) {
-        synthesizer.stopSpeaking(at: .immediate)
-        synthesizer.speak(avUtterance(from: utterance))
+        on(.play(utterance))
     }
 
     public func stop() {
-        synthesizer.stopSpeaking(at: .immediate)
+        on(.stop)
     }
-
-    private func avUtterance(from utterance: TTSUtterance) -> AVSpeechUtterance {
-        let avUtterance = AVUtterance(utterance: utterance)
-        avUtterance.rate = Float(avRateRange.valueForPercentage(config.rate))
-        avUtterance.pitchMultiplier = Float(avPitchRange.valueForPercentage(config.pitch))
-        avUtterance.preUtteranceDelay = utterance.delay
-        avUtterance.postUtteranceDelay = config.delay
-        avUtterance.voice = voice(for: utterance)
-        return avUtterance
-    }
-
-    private func voice(for utterance: TTSUtterance) -> AVSpeechSynthesisVoice? {
-        let language = utterance.language ?? config.defaultLanguage
-        if let voice = config.voice, voice.language.removingRegion() == language.removingRegion() {
-            return AVSpeechSynthesisVoice(identifier: voice.identifier)
-        } else {
-            return AVSpeechSynthesisVoice(language: language)
+    
+    
+    // MARK: AVSpeechSynthesizerDelegate
+    
+    public func speechSynthesizer(_ synthesizer: AVSpeechSynthesizer, didStart utterance: AVSpeechUtterance) {
+        guard let utterance = (utterance as? AVUtterance)?.utterance else {
+            return
         }
+        on(.didStart(utterance))
     }
 
     public func speechSynthesizer(_ synthesizer: AVSpeechSynthesizer, didFinish utterance: AVSpeechUtterance) {
         guard let utterance = (utterance as? AVUtterance)?.utterance else {
             return
         }
-        delegate?.ttsEngine(self, didFinish: utterance)
+        on(.didFinish(utterance))
     }
 
     public func speechSynthesizer(_ synthesizer: AVSpeechSynthesizer, willSpeakRangeOfSpeechString characterRange: NSRange, utterance avUtterance: AVSpeechUtterance) {
         guard
-            let delegate = delegate,
             let utterance = (avUtterance as? AVUtterance)?.utterance,
             let highlight = utterance.locator.text.highlight,
             let range = Range(characterRange, in: highlight)
@@ -104,7 +100,7 @@ public class AVTTSEngine: NSObject, TTSEngine, AVSpeechSynthesizerDelegate, Logg
         let rangeLocator = utterance.locator.copy(
             text: { text in text = text[range] }
         )
-        delegate.ttsEngine(self, willSpeakRangeAt: rangeLocator, of: utterance)
+        on(.willSpeakRange(locator: rangeLocator, utterance: utterance))
     }
 
     private class AVUtterance: AVSpeechUtterance {
@@ -119,6 +115,204 @@ public class AVTTSEngine: NSObject, TTSEngine, AVSpeechSynthesizerDelegate, Logg
             fatalError("Not supported")
         }
     }
+
+    
+    // MARK: State machine
+    
+    // Submitting a new utterance to `AVSpeechSynthesizer` before the `didStart`
+    // or `didFinish` event of the previous utterance was received triggers a
+    // deadlock on iOS 15, after which the engine ignores all further requests.
+    //
+    // The following state machine is used to make sure we never send commands
+    // to the `AVSpeechSynthesizer` when it's not ready.
+    //
+    // To visualize it, paste the following dot graph in https://edotor.net
+    /*
+        digraph {
+            {
+                stopped [style=filled]
+            }
+
+            stopped -> starting [label = "play"]
+
+            starting -> playing [label = "didStart"]
+            starting -> stopping [label = "play/stop"]
+
+            playing -> stopped [label = "didFinish"]
+            playing -> stopping [label = "play/stop"]
+            playing -> playing [label = "willSpeakRange"]
+
+            stopping -> stopping [label = "play/stop"]
+            stopping -> stopping [label = "didStart"]
+            stopping -> starting [label = "didFinish w/ next"]
+            stopping -> stopped [label = "didFinish w/o next"]
+        }
+     */
+    
+    /// Represents a state of the TTS engine.
+    private enum State: Equatable {
+        /// The TTS engine is waiting for the next utterance to play.
+        case stopped
+        /// A new utterance is being processed by the TTS engine, we wait for didStart.
+        case starting(TTSUtterance)
+        /// The utterance is currently playing and the engine is ready to process other commands.
+        case playing(TTSUtterance)
+        /// The engine was stopped while processing the previous utterance, we wait for didStart
+        /// and/or didFinish. The queued utterance will be played once the engine is successfully stopped.
+        case stopping(TTSUtterance, queued: TTSUtterance?)
+        /// Applies `event` to this state and returns the side effect to perform, if any.
+        mutating func on(_ event: Event) -> Effect? {
+            switch (self, event) {
+                
+            // stopped
+                
+            case let (.stopped, .play(utterance)):
+                self = .starting(utterance)
+                return .play(utterance)
+                
+            // starting
+                
+            case let (.starting(current), .didStart(started)) where current == started:
+                self = .playing(current)
+                return nil
+                
+            case let (.starting(current), .play(next)):
+                self = .stopping(current, queued: next)
+                return nil
+                
+            case let (.starting(current), .stop):
+                self = .stopping(current, queued: nil)
+                return nil
+                
+            // playing
+                
+            case let (.playing(current), .didFinish(finished)) where current == finished:
+                self = .stopped
+                return .notifyDidStopAfterLastUtterance(current)
+                
+            case let (.playing(current), .play(next)):
+                self = .stopping(current, queued: next)
+                return .stop
+                
+            case let (.playing(current), .stop):
+                self = .stopping(current, queued: nil)
+                return .stop
+                
+            case let (.playing(current), .willSpeakRange(locator: locator, utterance: speaking)) where current == speaking:
+                return .notifyWillSpeakRange(locator: locator, utterance: current)
+                
+            // stopping
+                
+            case let (.stopping(current, queued: next), .didStart(started)) where current == started:
+                self = .stopping(current, queued: next)
+                return .stop
+                
+            case let (.stopping(current, queued: next), .didFinish(finished)) where current == finished:
+                if let next = next {
+                    self = .starting(next)
+                    return .play(next)
+                } else {
+                    self = .stopped
+                    return .notifyDidStopAfterLastUtterance(current)
+                }
+                
+            case let (.stopping(current, queued: _), .play(next)):
+                self = .stopping(current, queued: next)
+                return nil
+                
+            case let (.stopping(current, queued: _), .stop):
+                self = .stopping(current, queued: nil)
+                return nil
+                
+            // Any other (state, event) pair leaves the state unchanged and has no effect.
+            default:
+                return nil
+            }
+        }
+    }
+    
+    /// State machine events triggered by the `AVSpeechSynthesizer` or the client
+    /// of `AVTTSEngine`.
+    private enum Event: Equatable {
+        // AVTTSEngine commands
+        case play(TTSUtterance)
+        case stop
+        
+        // AVSpeechSynthesizer delegate events
+        case didStart(TTSUtterance)
+        case willSpeakRange(locator: Locator, utterance: TTSUtterance)
+        case didFinish(TTSUtterance)
+    }
+    
+    /// State machine side effects triggered by a state transition from an event.
+    private enum Effect: Equatable {
+        // Ask `AVSpeechSynthesizer` to play the utterance.
+        case play(TTSUtterance)
+        // Ask `AVSpeechSynthesizer` to stop the playback.
+        case stop
+        
+        // Send notifications to our delegate.
+        case notifyWillSpeakRange(locator: Locator, utterance: TTSUtterance)
+        case notifyDidStopAfterLastUtterance(TTSUtterance)
+    }
+    
+    private var state: State = .stopped {
+        didSet {
+            if (debug) {
+                log(.debug, "* \(state)")
+            }
+        }
+    }
+    
+    /// Raises a TTS event triggering a state change and handles its side effects.
+    private func on(_ event: Event) {
+        assert(Thread.isMainThread, "Raising AVTTSEngine events must be done from the main thread")
+             
+        if (debug) {
+            log(.debug, "-> on \(event)")
+        }
+        
+        if let effect = state.on(event) {
+            handle(effect)
+        }
+    }
+    
+    /// Handles a state machine side effect.
+    private func handle(_ effect: Effect) {
+        switch effect {
+        // Commands sent to the `AVSpeechSynthesizer`.
+        case let .play(utterance):
+            synthesizer.speak(avUtterance(from: utterance))
+            
+        case .stop:
+            synthesizer.stopSpeaking(at: .immediate)
+        // Notifications forwarded to our delegate, if one is set.
+        case let .notifyWillSpeakRange(locator: locator, utterance: utterance):
+            delegate?.ttsEngine(self, willSpeakRangeAt: locator, of: utterance)
+            
+        case let .notifyDidStopAfterLastUtterance(utterance):
+            delegate?.ttsEngine(self, didStopAfterLastUtterance: utterance)
+        }
+    }
+    
+    private func avUtterance(from utterance: TTSUtterance) -> AVSpeechUtterance {
+        let avUtterance = AVUtterance(utterance: utterance)
+        avUtterance.rate = Float(avRateRange.valueForPercentage(config.rate))
+        avUtterance.pitchMultiplier = Float(avPitchRange.valueForPercentage(config.pitch))
+        avUtterance.preUtteranceDelay = utterance.delay
+        avUtterance.postUtteranceDelay = config.delay
+        avUtterance.voice = voice(for: utterance)
+        return avUtterance
+    }
+    
+    private func voice(for utterance: TTSUtterance) -> AVSpeechSynthesisVoice? {
+        let language = utterance.language ?? config.defaultLanguage
+        if let voice = config.voice, voice.language.removingRegion() == language.removingRegion() {
+            return AVSpeechSynthesisVoice(identifier: voice.identifier)
+        } else {
+            return AVSpeechSynthesisVoice(language: language)
+        }
+    }
 }
 
 private extension TTSVoice {
@@ -169,4 +363,4 @@ private extension AVSpeechSynthesisVoice {
     convenience init?(language: Language) {
         self.init(language: language.code.bcp47)
     }
-}
+}
diff --git a/Sources/Navigator/TTS/TTSController.swift b/Sources/Navigator/TTS/TTSController.swift
@@ -280,8 +280,8 @@ public class TTSController: Loggable, TTSEngineDelegate {
 
     // MARK: - TTSEngineDelegate
 
-    public func ttsEngine(_ engine: TTSEngine, didFinish utterance: TTSUtterance) {
-        if isPlaying && currentUtterance == utterance {
+    public func ttsEngine(_ engine: TTSEngine, didStopAfterLastUtterance utterance: TTSUtterance) {
+        if isPlaying {
             next()
         }
     }
diff --git a/Sources/Navigator/TTS/TTSEngine.swift b/Sources/Navigator/TTS/TTSEngine.swift
@@ -20,7 +20,7 @@ public protocol TTSEngine: AnyObject {
 
 public protocol TTSEngineDelegate: AnyObject {
     func ttsEngine(_ engine: TTSEngine, willSpeakRangeAt locator: Locator, of utterance: TTSUtterance)
-    func ttsEngine(_ engine: TTSEngine, didFinish utterance: TTSUtterance)
+    func ttsEngine(_ engine: TTSEngine, didStopAfterLastUtterance utterance: TTSUtterance)
 }
 
 public struct TTSConfiguration {
diff --git a/TestApp/Sources/Reader/Common/ReaderViewController.swift b/TestApp/Sources/Reader/Common/ReaderViewController.swift
@@ -124,7 +124,7 @@ class ReaderViewController: UIViewController, Loggable {
             controls.didMove(toParent: self)
 
             state
-                .sink { [unowned self] state in
+                .sink { state in
                     controls.view.isHidden = (state == .stopped)
                 }
                 .store(in: &subscriptions)
diff --git a/TestApp/Sources/Reader/Common/TTS/TTSViewModel.swift b/TestApp/Sources/Reader/Common/TTS/TTSViewModel.swift
@@ -45,7 +45,7 @@ final class TTSViewModel: ObservableObject, Loggable {
         var isMoving = false
         playingRangeLocatorSubject
             .throttle(for: 1, scheduler: RunLoop.main, latest: true)
-            .sink { [unowned self] locator in
+            .sink { locator in
                 guard !isMoving else {
                     return
                 }
@@ -131,4 +131,4 @@ extension TTSViewModel: TTSControllerDelegate {
     public func ttsController(_ ttsController: TTSController, willSpeakRangeAt locator: Locator, of utterance: TTSUtterance) {
         playingRangeLocatorSubject.send(locator)
     }
-}
+}

Original file line number	Diff line number	Diff line change
`@@ -280,8 +280,8 @@ public class TTSController: Loggable, TTSEngineDelegate {`
`280`	`280`
`281`	`281`	`// MARK: - TTSEngineDelegate`
`282`	`282`
`283`		`- public func ttsEngine(_ engine: TTSEngine, didFinish utterance: TTSUtterance) {`
`284`		`- if isPlaying && currentUtterance == utterance {`
	`283`	`+ public func ttsEngine(_ engine: TTSEngine, didStopAfterLastUtterance utterance: TTSUtterance) {`
	`284`	`+ if isPlaying {`
`285`	`285`	`next()`
`286`	`286`	`}`
`287`	`287`	`}`
Original file line number	Diff line number	Diff line change
`@@ -20,7 +20,7 @@ public protocol TTSEngine: AnyObject {`
`20`	`20`
`21`	`21`	`public protocol TTSEngineDelegate: AnyObject {`
`22`	`22`	`func ttsEngine(_ engine: TTSEngine, willSpeakRangeAt locator: Locator, of utterance: TTSUtterance)`
`23`		`- func ttsEngine(_ engine: TTSEngine, didFinish utterance: TTSUtterance)`
	`23`	`+ func ttsEngine(_ engine: TTSEngine, didStopAfterLastUtterance utterance: TTSUtterance)`
`24`	`24`	`}`
`25`	`25`
`26`	`26`	`public struct TTSConfiguration {`
Original file line number	Diff line number	Diff line change
`@@ -124,7 +124,7 @@ class ReaderViewController: UIViewController, Loggable {`
`124`	`124`	`controls.didMove(toParent: self)`
`125`	`125`
`126`	`126`	`state`
`127`		`- .sink { [unowned self] state in`
	`127`	`+ .sink { state in`
`128`	`128`	`controls.view.isHidden = (state == .stopped)`
`129`	`129`	`}`
`130`	`130`	`.store(in: &subscriptions)`
Original file line number	Diff line number	Diff line change
`@@ -45,7 +45,7 @@ final class TTSViewModel: ObservableObject, Loggable {`
`45`	`45`	`var isMoving = false`
`46`	`46`	`playingRangeLocatorSubject`
`47`	`47`	`.throttle(for: 1, scheduler: RunLoop.main, latest: true)`
`48`		`- .sink { [unowned self] locator in`
	`48`	`+ .sink { locator in`
`49`	`49`	`guard !isMoving else {`
`50`	`50`	`return`
`51`	`51`	`}`
`@@ -131,4 +131,4 @@ extension TTSViewModel: TTSControllerDelegate {`
`131`	`131`	`public func ttsController(_ ttsController: TTSController, willSpeakRangeAt locator: Locator, of utterance: TTSUtterance) {`
`132`	`132`	`playingRangeLocatorSubject.send(locator)`
`133`	`133`	`}`
`134`		`-}`
	`134`	`+}`