@@ -181,16 +181,19 @@ export namespace RealtimeSessionCreateResponse {
181181 /**
182182 * Configuration for turn detection, either Server VAD or Semantic VAD. This can be
183183 * set to `null` to turn off, in which case the client must manually trigger model
184- * response. Server VAD means that the model will detect the start and end of
185- * speech based on audio volume and respond at the end of user speech. Semantic VAD
186- * is more advanced and uses a turn detection model (in conjunction with VAD) to
187- * semantically estimate whether the user has finished speaking, then dynamically
188- * sets a timeout based on this probability. For example, if user audio trails off
189- * with "uhhm", the model will score a low probability of turn end and wait longer
190- * for the user to continue speaking. This can be useful for more natural
191- * conversations, but may have a higher latency.
184+ * response.
185+ *
186+ * Server VAD means that the model will detect the start and end of speech based on
187+ * audio volume and respond at the end of user speech.
188+ *
189+ * Semantic VAD is more advanced and uses a turn detection model (in conjunction
190+ * with VAD) to semantically estimate whether the user has finished speaking, then
191+ * dynamically sets a timeout based on this probability. For example, if user audio
192+ * trails off with "uhhm", the model will score a low probability of turn end and
193+ * wait longer for the user to continue speaking. This can be useful for more
194+ * natural conversations, but may have a higher latency.
192195 */
193- turn_detection ?: Input . TurnDetection ;
196+ turn_detection ?: Input . ServerVad | Input . SemanticVad | null ;
194197 }
195198
196199 export namespace Input {
@@ -211,35 +214,34 @@ export namespace RealtimeSessionCreateResponse {
211214 }
212215
213216 /**
214- * Configuration for turn detection, ether Server VAD or Semantic VAD. This can be
215- * set to `null` to turn off, in which case the client must manually trigger model
216- * response. Server VAD means that the model will detect the start and end of
217- * speech based on audio volume and respond at the end of user speech. Semantic VAD
218- * is more advanced and uses a turn detection model (in conjunction with VAD) to
219- * semantically estimate whether the user has finished speaking, then dynamically
220- * sets a timeout based on this probability. For example, if user audio trails off
221- * with "uhhm", the model will score a low probability of turn end and wait longer
222- * for the user to continue speaking. This can be useful for more natural
223- * conversations, but may have a higher latency.
217+ * Server-side voice activity detection (VAD) which flips on when user speech is
218+ * detected and off after a period of silence.
224219 */
225- export interface TurnDetection {
220+ export interface ServerVad {
226221 /**
227- * Whether or not to automatically generate a response when a VAD stop event
228- * occurs.
222+ * Type of turn detection, `server_vad` to turn on simple Server VAD.
229223 */
230- create_response ?: boolean ;
224+ type : 'server_vad' ;
231225
232226 /**
233- * Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
234- * will wait longer for the user to continue speaking, `high` will respond more
235- * quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`,
236- * and `high` have max timeouts of 8s, 4s, and 2s respectively.
227+ * Whether or not to automatically generate a response when a VAD stop event
228+ * occurs.
237229 */
238- eagerness ?: 'low' | 'medium' | 'high' | 'auto' ;
230+ create_response ?: boolean ;
239231
240232 /**
241- * Optional idle timeout after which turn detection will auto-timeout when no
242- * additional audio is received and emits a `timeout_triggered` event.
233+ * Optional timeout after which a model response will be triggered automatically.
234+ * This is useful for situations in which a long pause from the user is unexpected,
235+ * such as a phone call. The model will effectively prompt the user to continue the
236+ * conversation based on the current context.
237+ *
238+ * The timeout value will be applied after the last model response's audio has
239+ * finished playing, i.e. it's set to the `response.done` time plus audio playback
240+ * duration.
241+ *
242+ * An `input_audio_buffer.timeout_triggered` event (plus events associated with the
243+ * Response) will be emitted when the timeout is reached. Idle timeout is currently
244+ * only supported for `server_vad` mode.
243245 */
244246 idle_timeout_ms ?: number | null ;
245247
@@ -269,11 +271,38 @@ export namespace RealtimeSessionCreateResponse {
269271 * model, and thus might perform better in noisy environments.
270272 */
271273 threshold ?: number ;
274+ }
272275
276+ /**
277+ * Server-side semantic turn detection which uses a model to determine when the
278+ * user has finished speaking.
279+ */
280+ export interface SemanticVad {
273281 /**
274- * Type of turn detection.
282+ * Type of turn detection, `semantic_vad` to turn on Semantic VAD.
275283 */
276- type ?: 'server_vad' | 'semantic_vad' ;
284+ type : 'semantic_vad' ;
285+
286+ /**
287+ * Whether or not to automatically generate a response when a VAD stop event
288+ * occurs.
289+ */
290+ create_response ?: boolean ;
291+
292+ /**
293+ * Used only for `semantic_vad` mode. The eagerness of the model to respond. `low`
294+ * will wait longer for the user to continue speaking, `high` will respond more
295+ * quickly. `auto` is the default and is equivalent to `medium`. `low`, `medium`,
296+ * and `high` have max timeouts of 8s, 4s, and 2s respectively.
297+ */
298+ eagerness ?: 'low' | 'medium' | 'high' | 'auto' ;
299+
300+ /**
301+ * Whether or not to automatically interrupt any ongoing response with output to
302+ * the default conversation (i.e. `conversation` of `auto`) when a VAD start event
303+ * occurs.
304+ */
305+ interrupt_response ?: boolean ;
277306 }
278307 }
279308
0 commit comments