@@ -8,6 +8,7 @@ import AVFoundation
88import Foundation
99import R2Shared
1010
11+ /// Implementation of a `TTSEngine` using Apple AVFoundation's `AVSpeechSynthesizer`.
1112public class AVTTSEngine : NSObject , TTSEngine , AVSpeechSynthesizerDelegate , Loggable {
1213
1314 /// Range of valid values for an AVUtterance rate.
@@ -28,12 +29,17 @@ public class AVTTSEngine: NSObject, TTSEngine, AVSpeechSynthesizerDelegate, Logg
2829
2930 public let defaultConfig : TTSConfiguration
3031 public var config : TTSConfiguration
32+ private let debug : Bool
3133
3234 public weak var delegate : TTSEngineDelegate ?
3335
3436 private let synthesizer = AVSpeechSynthesizer ( )
3537
36- public override init ( ) {
38+ /// Creates a new `AVTTSEngine` instance.
39+ ///
40+ /// - Parameters:
41+ /// - debug: Print the state machine transitions.
42+ public init ( debug: Bool = false ) {
3743 let config = TTSConfiguration (
3844 defaultLanguage: Language ( code: . bcp47( AVSpeechSynthesisVoice . currentLanguageCode ( ) ) ) ,
3945 rate: avRateRange. percentageForValue ( Double ( AVSpeechUtteranceDefaultSpeechRate) ) ,
@@ -42,6 +48,7 @@ public class AVTTSEngine: NSObject, TTSEngine, AVSpeechSynthesizerDelegate, Logg
4248
4349 self . defaultConfig = config
4450 self . config = config
51+ self . debug = debug
4552
4653 super. init ( )
4754 synthesizer. delegate = self
@@ -57,43 +64,32 @@ public class AVTTSEngine: NSObject, TTSEngine, AVSpeechSynthesizerDelegate, Logg
5764 }
5865
/// Requests the playback of the given utterance.
///
/// The request is not sent directly to the synthesizer: it is raised as a
/// `play` event on the state machine.
///
/// - Parameter utterance: Utterance to speak.
public func speak(_ utterance: TTSUtterance) {
    let event = Event.play(utterance)
    on(event)
}
6369
/// Requests the playback to stop by raising a `stop` event on the state
/// machine.
public func stop() {
    let event = Event.stop
    on(event)
}
67-
68- private func avUtterance( from utterance: TTSUtterance ) -> AVSpeechUtterance {
69- let avUtterance = AVUtterance ( utterance: utterance)
70- avUtterance. rate = Float ( avRateRange. valueForPercentage ( config. rate) )
71- avUtterance. pitchMultiplier = Float ( avPitchRange. valueForPercentage ( config. pitch) )
72- avUtterance. preUtteranceDelay = utterance. delay
73- avUtterance. postUtteranceDelay = config. delay
74- avUtterance. voice = voice ( for: utterance)
75- return avUtterance
76- }
77-
78- private func voice( for utterance: TTSUtterance ) -> AVSpeechSynthesisVoice ? {
79- let language = utterance. language ?? config. defaultLanguage
80- if let voice = config. voice, voice. language. removingRegion ( ) == language. removingRegion ( ) {
81- return AVSpeechSynthesisVoice ( identifier: voice. identifier)
82- } else {
83- return AVSpeechSynthesisVoice ( language: language)
73+
74+
75+ // MARK: AVSpeechSynthesizerDelegate
76+
/// `AVSpeechSynthesizerDelegate` callback for a started utterance.
///
/// Raises a `didStart` event for the `TTSUtterance` wrapped by the given
/// `AVUtterance`. Utterances which cannot be resolved to a `TTSUtterance` are
/// ignored.
public func speechSynthesizer(_ synthesizer: AVSpeechSynthesizer, didStart utterance: AVSpeechUtterance) {
    if let started = (utterance as? AVUtterance)?.utterance {
        on(.didStart(started))
    }
}
8683
/// `AVSpeechSynthesizerDelegate` callback for an utterance which was spoken
/// until its end.
///
/// Raises a `didFinish` event for the `TTSUtterance` wrapped by the given
/// `AVUtterance`. Utterances which cannot be resolved to a `TTSUtterance` are
/// ignored.
public func speechSynthesizer(_ synthesizer: AVSpeechSynthesizer, didFinish utterance: AVSpeechUtterance) {
    guard let utterance = (utterance as? AVUtterance)?.utterance else {
        return
    }
    on(.didFinish(utterance))
}

/// `AVSpeechSynthesizerDelegate` callback for an utterance interrupted by
/// `stopSpeaking(at:)`.
///
/// An interrupted utterance is reported through `didCancel` instead of
/// `didFinish`. Without this callback the state machine would keep waiting
/// for a `didFinish` event while in the `stopping` state, and a queued
/// utterance would never be played. Both callbacks therefore raise the same
/// `didFinish` event.
///
/// NOTE(review): should the system ever emit both callbacks for the same
/// utterance, the second event matches no transition and is ignored.
public func speechSynthesizer(_ synthesizer: AVSpeechSynthesizer, didCancel utterance: AVSpeechUtterance) {
    guard let utterance = (utterance as? AVUtterance)?.utterance else {
        return
    }
    on(.didFinish(utterance))
}
9390
9491 public func speechSynthesizer( _ synthesizer: AVSpeechSynthesizer , willSpeakRangeOfSpeechString characterRange: NSRange , utterance avUtterance: AVSpeechUtterance ) {
9592 guard
96- let delegate = delegate,
9793 let utterance = ( avUtterance as? AVUtterance ) ? . utterance,
9894 let highlight = utterance. locator. text. highlight,
9995 let range = Range ( characterRange, in: highlight)
@@ -104,7 +100,7 @@ public class AVTTSEngine: NSObject, TTSEngine, AVSpeechSynthesizerDelegate, Logg
104100 let rangeLocator = utterance. locator. copy (
105101 text: { text in text = text [ range] }
106102 )
107- delegate . ttsEngine ( self , willSpeakRangeAt : rangeLocator, of : utterance)
103+ on ( . willSpeakRange ( locator : rangeLocator, utterance : utterance) )
108104 }
109105
110106 private class AVUtterance : AVSpeechUtterance {
@@ -119,6 +115,204 @@ public class AVTTSEngine: NSObject, TTSEngine, AVSpeechSynthesizerDelegate, Logg
119115 fatalError ( " Not supported " )
120116 }
121117 }
118+
119+
120+ // MARK: State machine
121+
122+ // Submitting new utterances to `AVSpeechSynthesizer` before the `didStart` or
123+ // `didFinish` events of the previous utterance were received triggers a
124+ // deadlock on iOS 15: the engine then ignores all subsequent requests.
125+ //
126+ // The following state machine is used to make sure we never send commands
127+ // to the `AVSpeechSynthesizer` when it's not ready.
128+ //
129+ // To visualize it, paste the following dot graph in https://edotor.net
130+ /*
131+ digraph {
132+ {
133+ stopped [style=filled]
134+ }
135+
136+ stopped -> starting [label = "play"]
137+
138+ starting -> playing [label = "didStart"]
139+ starting -> stopping [label = "play/stop"]
140+
141+ playing -> stopped [label = "didFinish"]
142+ playing -> stopping [label = "play/stop"]
143+ playing -> playing [label = "willSpeakRange"]
144+
145+ stopping -> stopping [label = "play/stop"]
146+ stopping -> stopping [label = "didStart"]
147+ stopping -> starting [label = "didFinish w/ next"]
148+ stopping -> stopped [label = "didFinish w/o next"]
149+ }
150+ */
151+
/// Represents a state of the TTS engine.
private enum State: Equatable {
    /// The TTS engine is waiting for the next utterance to play.
    case stopped
    /// A new utterance is being processed by the TTS engine, we wait for didStart.
    case starting(TTSUtterance)
    /// The utterance is currently playing and the engine is ready to process other commands.
    case playing(TTSUtterance)
    /// The engine was stopped while processing the previous utterance, we wait for didStart
    /// and/or didFinish. The queued utterance will be played once the engine is successfully stopped.
    case stopping(TTSUtterance, queued: TTSUtterance?)

    /// Applies `event` to the current state.
    ///
    /// Events which do not match any transition from the current state leave
    /// the state untouched.
    ///
    /// - Parameter event: Command or synthesizer callback to process.
    /// - Returns: The side effect to perform after the transition, or `nil`
    ///   when there is nothing to do.
    mutating func on(_ event: Event) -> Effect? {
        switch (self, event) {

        // stopped

        case let (.stopped, .play(utterance)):
            self = .starting(utterance)
            return .play(utterance)

        // starting

        case let (.starting(current), .didStart(started)) where current == started:
            self = .playing(current)
            return nil

        // No effect is returned while starting: the synthesizer is only asked
        // to stop once `didStart` is received in the `stopping` state.
        case let (.starting(current), .play(next)):
            self = .stopping(current, queued: next)
            return nil

        case let (.starting(current), .stop):
            self = .stopping(current, queued: nil)
            return nil

        // playing

        case let (.playing(current), .didFinish(finished)) where current == finished:
            self = .stopped
            return .notifyDidStopAfterLastUtterance(current)

        case let (.playing(current), .play(next)):
            self = .stopping(current, queued: next)
            return .stop

        case let (.playing(current), .stop):
            self = .stopping(current, queued: nil)
            return .stop

        case let (.playing(current), .willSpeakRange(locator: locator, utterance: speaking)) where current == speaking:
            return .notifyWillSpeakRange(locator: locator, utterance: current)

        // stopping

        // The utterance being stopped just started: the state is unchanged
        // and the synthesizer can now be asked to stop.
        case let (.stopping(current, queued: _), .didStart(started)) where current == started:
            return .stop

        case let (.stopping(current, queued: next), .didFinish(finished)) where current == finished:
            if let next = next {
                self = .starting(next)
                return .play(next)
            } else {
                self = .stopped
                return .notifyDidStopAfterLastUtterance(current)
            }

        // Only the latest request is kept while stopping.
        case let (.stopping(current, queued: _), .play(next)):
            self = .stopping(current, queued: next)
            return nil

        case let (.stopping(current, queued: _), .stop):
            self = .stopping(current, queued: nil)
            return nil

        default:
            return nil
        }
    }
}
233+
/// Inputs of the state machine, raised either by the client of `AVTTSEngine`
/// or by the `AVSpeechSynthesizer` delegate callbacks.
private enum Event: Equatable {

    // Commands requested through `AVTTSEngine`.

    /// Play the given utterance.
    case play(TTSUtterance)
    /// Stop the playback.
    case stop

    // Callbacks received from the `AVSpeechSynthesizer` delegate.

    /// The synthesizer started the given utterance.
    case didStart(TTSUtterance)
    /// The synthesizer is about to speak the portion of `utterance` located
    /// at `locator`.
    case willSpeakRange(locator: Locator, utterance: TTSUtterance)
    /// The synthesizer finished the given utterance.
    case didFinish(TTSUtterance)
}
246+
/// Outputs of the state machine: work to perform after an event was processed.
private enum Effect: Equatable {

    // Commands sent to the `AVSpeechSynthesizer`.

    /// Ask the synthesizer to speak the utterance.
    case play(TTSUtterance)
    /// Ask the synthesizer to stop the playback.
    case stop

    // Notifications forwarded to the `TTSEngineDelegate`.

    /// Report that the portion of `utterance` located at `locator` is about
    /// to be spoken.
    case notifyWillSpeakRange(locator: Locator, utterance: TTSUtterance)
    /// Report that the engine stopped after the given utterance.
    case notifyDidStopAfterLastUtterance(TTSUtterance)
}
258+
/// Current state of the state machine, initially `.stopped`.
///
/// When `debug` is enabled, the new state is logged every time the property
/// is set.
private var state: State = .stopped {
    didSet {
        guard debug else { return }
        log(.debug, " * \(state) ")
    }
}
266+
/// Feeds `event` to the state machine, then performs the side effect produced
/// by the transition, if any.
///
/// Must be called from the main thread (checked with an assertion).
///
/// - Parameter event: Command or synthesizer callback to process.
private func on(_ event: Event) {
    assert(Thread.isMainThread, " Raising AVTTSEngine events must be done from the main thread ")

    if debug {
        log(.debug, " -> on \(event) ")
    }

    guard let effect = state.on(event) else {
        return
    }
    handle(effect)
}
279+
/// Handles a state machine side effect.
///
/// `play` and `stop` are forwarded to the `AVSpeechSynthesizer`; the
/// notifications are forwarded to `delegate`, if any.
///
/// - Parameter effect: Side effect returned by a state transition.
private func handle(_ effect: Effect) {
    switch effect {

    case let .play(utterance):
        synthesizer.speak(avUtterance(from: utterance))

    case .stop:
        synthesizer.stopSpeaking(at: .immediate)

    case let .notifyWillSpeakRange(locator: locator, utterance: utterance):
        delegate?.ttsEngine(self, willSpeakRangeAt: locator, of: utterance)

    case let .notifyDidStopAfterLastUtterance(utterance):
        delegate?.ttsEngine(self, didStopAfterLastUtterance: utterance)
    }
}
297+
/// Builds the `AVSpeechUtterance` submitted to the synthesizer for the given
/// `TTSUtterance`.
///
/// Rate, pitch and post-utterance delay come from the current `config`; the
/// pre-utterance delay comes from the utterance itself.
///
/// - Parameter utterance: Utterance to wrap.
/// - Returns: An `AVUtterance` wrapping `utterance`.
private func avUtterance(from utterance: TTSUtterance) -> AVSpeechUtterance {
    let rate = avRateRange.valueForPercentage(config.rate)
    let pitch = avPitchRange.valueForPercentage(config.pitch)

    let speech = AVUtterance(utterance: utterance)
    speech.voice = voice(for: utterance)
    speech.rate = Float(rate)
    speech.pitchMultiplier = Float(pitch)
    speech.preUtteranceDelay = utterance.delay
    speech.postUtteranceDelay = config.delay
    return speech
}
307+
/// Picks the synthesizer voice used to speak the given utterance.
///
/// The voice set in `config` is used when its language matches the language
/// of the utterance (or the default language when the utterance has none),
/// regions being ignored. Otherwise, a voice is created from the language
/// alone.
///
/// - Parameter utterance: Utterance to speak.
/// - Returns: The voice to use, or `nil` when none could be created.
private func voice(for utterance: TTSUtterance) -> AVSpeechSynthesisVoice? {
    let language = utterance.language ?? config.defaultLanguage

    guard
        let preferred = config.voice,
        preferred.language.removingRegion() == language.removingRegion()
    else {
        return AVSpeechSynthesisVoice(language: language)
    }
    return AVSpeechSynthesisVoice(identifier: preferred.identifier)
}
122316}
123317
124318private extension TTSVoice {
@@ -169,4 +363,4 @@ private extension AVSpeechSynthesisVoice {
/// Creates a voice from the BCP 47 code of the given `Language`.
convenience init?(language: Language) {
    let code = language.code.bcp47
    self.init(language: code)
}
172- }
366+ }
0 commit comments