diff --git a/src/realtime/api.rs b/src/realtime/api.rs index 82faeb8b..cf901b39 100644 --- a/src/realtime/api.rs +++ b/src/realtime/api.rs @@ -7,6 +7,8 @@ use tokio_tungstenite::{ MaybeTlsStream, WebSocketStream, }; +pub mod sip; + const WSS_URL: &str = "wss://api.openai.com/v1/realtime"; pub struct RealtimeClient { diff --git a/src/realtime/api/sip.rs b/src/realtime/api/sip.rs new file mode 100644 index 00000000..066cf368 --- /dev/null +++ b/src/realtime/api/sip.rs @@ -0,0 +1,181 @@ +use serde::{Deserialize, Serialize}; + +use super::*; + +/// Intended for connecting to an already existing Realtime session spawned by accepting an incoming SIP call from e.g. Twilio. +pub struct RealtimeSipClient { + pub wss_url: String, + pub api_key: String, + pub call_id: String, +} + +impl RealtimeSipClient { + pub fn new(api_key: String, call_id: String) -> Self { + let wss_url = std::env::var("WSS_URL").unwrap_or_else(|_| WSS_URL.to_owned()); + Self::new_with_endpoint(wss_url, api_key, call_id) + } + + pub fn new_with_endpoint(wss_url: String, api_key: String, call_id: String) -> Self { + Self { + wss_url, + api_key, + call_id, + } + } + + pub async fn connect( + &self, + ) -> Result< + ( + SplitSink>, Message>, + SplitStream>>, + ), + Box, + > { + let url = format!("{}?call_id={}", self.wss_url, self.call_id); + let mut request = url.into_client_request()?; + let api_key = self.api_key.clone(); + request + .headers_mut() + .insert("Authorization", format!("Bearer {api_key}").parse()?); + let (ws_stream, _) = connect_async(request).await?; + let (write, read) = ws_stream.split(); + Ok((write, read)) + } +} + +/// This is the payload of a `realtime.call.incoming` event webhook which is what OpenAI sends to your application when a call hits the SIP endpoint for your project. +/// Exposes some convenience methods for when a call comes from Twilio which is one of the more common use cases. `openai_call_id()` is what you will need to use accept/hangup endpoints. 
+/// +/// # Example +/// ```rust +/// const INSTRUCTIONS: &str = "You are a helpful assistant."; +/// #[axum::debug_handler] +/// async fn call_webhook( +/// State(mut state): State, +/// Json(event): Json, +/// ) -> impl IntoResponse { +/// let number = event.caller_number(); +/// let call_id = event.openai_call_id(); +/// let twilio_sid = event.twilio_call_sid(); +/// let account_sid = event.twilio_account_sid(); +/// log::info!( +/// "Call coming in from {:?} with OpenAi ID {:?}, Twilio SID {:?} / account SID {:?}", +/// number, +/// call_id, +/// twilio_sid, +/// account_sid +/// ); +/// +/// let accept_call = AcceptCallRequest::new(INSTRUCTIONS, RealtimeModel::GptRealtime); +/// +/// match state.openai_client.accept_call(call_id, accept_call).await { +/// Ok(_) => { +/// log::info!("Accepted call {}", call_id); +/// } +/// Err(err) => { +/// log::error!("Failed to accept call {}: {}", call_id, err); +/// } +/// }; +/// () +/// } +/// ``` +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct RealtimeCallIncoming { + pub id: String, + /// Always `event`. + pub object: String, + pub created_at: i64, + /// This should always be `realtime.call.incoming`. + #[serde(rename = "type")] + pub event_type: String, + /// Contains the actual unique data per call. Look for `call_id` here or call `openai_call_id()`. 
+ pub data: RealTimeCallIncomingData, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct RealTimeCallIncomingData { + pub call_id: String, + pub sip_headers: Vec, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct SipHeader { + pub name: String, + pub value: String, +} + +impl RealtimeCallIncoming { + /// Get the call ID from the event data + pub fn openai_call_id(&self) -> &str { + &self.data.call_id + } + + /// Extract the caller's phone number from the "From" SIP header + pub fn caller_number(&self) -> Option { + self.data + .sip_headers + .iter() + .find(|header| header.name == "From") + .and_then(|header| { + // Parse the From header to extract the phone number + // Format: "+48123123123" ;tag=... + if let Some(start) = header.value.find('"') { + if let Some(end) = header.value[start + 1..].find('"') { + return Some(header.value[start + 1..start + 1 + end].to_string()); + } + } + None + }) + } + + /// Get the Twilio Call SID from the X-Twilio-CallSid SIP header + pub fn twilio_call_sid(&self) -> Option<&str> { + self.data + .sip_headers + .iter() + .find(|header| header.name == "X-Twilio-CallSid") + .map(|header| header.value.as_str()) + } + + /// Get the Twilio Account SID from the X-Twilio-AccountSid SIP header + pub fn twilio_account_sid(&self) -> Option<&str> { + self.data + .sip_headers + .iter() + .find(|header| header.name == "X-Twilio-AccountSid") + .map(|header| header.value.as_str()) + } + + /// Get a specific SIP header value by name + pub fn get_sip_header(&self, name: &str) -> Option<&str> { + self.data + .sip_headers + .iter() + .find(|header| header.name == name) + .map(|header| header.value.as_str()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_parse_twilio_event() { + let json = r#"{"id": "evt_68bc6828707881908be189456b84cc07", "object": "event", "created_at": 1757177896, "type": "realtime.call.incoming", "data": {"call_id": "rtc_c5b6f97fe96f4c809b78916a9ac15748", "sip_headers": 
[{"name": "From", "value": "\"+48123123123\" ;tag=82568196_c3356d0b_03f1232a-01cf-4a4a-af25-bac077219d08"}, {"name": "X-Twilio-CallSid", "value": "CA080dd4bebc0320639d7ae33b82e80481"}, {"name": "X-Twilio-AccountSid", "value": "fake_data"}]}}"#; + + let event: RealtimeCallIncoming = serde_json::from_str(json).unwrap(); + + assert_eq!( + event.openai_call_id(), + "rtc_c5b6f97fe96f4c809b78916a9ac15748" + ); + assert_eq!(event.caller_number(), Some("+48123123123".to_string())); + assert_eq!( + event.twilio_call_sid(), + Some("CA080dd4bebc0320639d7ae33b82e80481") + ); + assert_eq!(event.twilio_account_sid(), Some("fake_data")); + } +} diff --git a/src/realtime/client_event.rs b/src/realtime/client_event.rs index 53805381..aa17e5d3 100644 --- a/src/realtime/client_event.rs +++ b/src/realtime/client_event.rs @@ -1,7 +1,7 @@ use serde::{Deserialize, Serialize}; use tokio_tungstenite::tungstenite::Message; -use crate::realtime::types::{Item, Session}; +use crate::realtime::types::{Item, RealtimeSession, Session}; #[derive(Debug, Serialize, Deserialize, Clone, Default)] pub struct SessionUpdate { @@ -58,7 +58,7 @@ pub struct ConversationItemDelete { pub struct ResponseCreate { #[serde(skip_serializing_if = "Option::is_none")] pub event_id: Option, - pub response: Option, + pub response: Option, } #[derive(Debug, Serialize, Deserialize, Clone, Default)] diff --git a/src/realtime/server_event.rs b/src/realtime/server_event.rs index b02a139d..1de493d2 100644 --- a/src/realtime/server_event.rs +++ b/src/realtime/server_event.rs @@ -55,7 +55,14 @@ pub struct InputAudioBufferSpeechStopped { } #[derive(Debug, Serialize, Deserialize, Clone)] -pub struct ConversationItemCreated { +pub struct ConversationItemAdded { + pub event_id: String, + pub previous_item_id: Option, + pub item: Item, +} + +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct ConversationItemDone { pub event_id: String, pub previous_item_id: Option, pub item: Item, @@ -69,6 +76,15 @@ pub struct 
ConversationItemInputAudioTranscriptionCompleted { pub transcript: String, } +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct ConversationItemInputAudioTranscriptionDelta { + pub event_id: String, + pub item_id: String, + pub content_index: u32, + pub delta: String, + // todo: add logprobs support +} + #[derive(Debug, Serialize, Deserialize, Clone)] pub struct ConversationItemInputAudioTranscriptionFailed { pub event_id: String, @@ -178,7 +194,7 @@ pub struct ResponseTextDone { } #[derive(Debug, Serialize, Deserialize, Clone)] -pub struct ResponseAudioTranscriptDelta { +pub struct ResponseOutputAudioTranscriptDelta { pub event_id: String, pub response_id: String, pub item_id: String, @@ -188,7 +204,7 @@ pub struct ResponseAudioTranscriptDelta { } #[derive(Debug, Serialize, Deserialize, Clone)] -pub struct ResponseAudioTranscriptDone { +pub struct ResponseOutputAudioTranscriptDone { pub event_id: String, pub response_id: String, pub item_id: String, @@ -198,7 +214,7 @@ pub struct ResponseAudioTranscriptDone { } #[derive(Debug, Serialize, Deserialize, Clone)] -pub struct ResponseAudioDelta { +pub struct ResponseOutputAudioDelta { pub event_id: String, pub response_id: String, pub item_id: String, @@ -208,7 +224,7 @@ pub struct ResponseAudioDelta { } #[derive(Debug, Serialize, Deserialize, Clone)] -pub struct ResponseAudioDone { +pub struct ResponseOutputAudioDone { pub event_id: String, pub response_id: String, pub item_id: String, @@ -236,12 +252,71 @@ pub struct ResponseFunctionCallArgumentsDone { pub arguments: String, } +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct ResponseMcpCallArgumentsDelta { + pub event_id: String, + pub item_id: String, + #[serde(default)] + pub obfuscation: Option, + pub output_index: u32, + pub response_id: String, + pub delta: String, +} + +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct ResponseMcpCallArgumentsDone { + pub event_id: String, + pub item_id: String, + pub output_index: u32, + pub 
response_id: String, + pub arguments: String, +} + +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct ResponseMcpCallInProgress { + pub event_id: String, + pub item_id: String, + pub output_index: u32, +} + +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct ResponseMcpCallCompleted { + pub event_id: String, + pub item_id: String, + pub output_index: u32, +} + +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct ResponseMcpCallFailed { + pub event_id: String, + pub item_id: String, + pub output_index: u32, +} + #[derive(Debug, Serialize, Deserialize, Clone)] pub struct RateLimitsUpdated { pub event_id: String, pub rate_limits: Vec, } +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct McpListToolsInProgress { + pub event_id: String, + pub item_id: String, +} + +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct McpListToolsCompleted { + pub event_id: String, + pub item_id: String, +} + +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct McpListToolsFailed { + pub event_id: String, + pub item_id: String, +} + #[derive(Debug, Serialize, Deserialize, Clone)] #[serde(tag = "type")] pub enum ServerEvent { @@ -261,18 +336,22 @@ pub enum ServerEvent { InputAudioBufferSpeechStarted(InputAudioBufferSpeechStarted), #[serde(rename = "input_audio_buffer.speech_stopped")] InputAudioBufferSpeechStopped(InputAudioBufferSpeechStopped), - #[serde(rename = "conversation.item.created")] - ConversationItemCreated(ConversationItemCreated), + #[serde(rename = "conversation.item.added")] + ConversationItemAdded(ConversationItemAdded), #[serde(rename = "conversation.item.input_audio_transcription.completed")] ConversationItemInputAudioTranscriptionCompleted( ConversationItemInputAudioTranscriptionCompleted, ), + #[serde(rename = "conversation.item.input_audio_transcription.delta")] + ConversationItemInputAudioTranscriptionDelta(ConversationItemInputAudioTranscriptionDelta), #[serde(rename = 
"conversation.item.input_audio_transcription.failed")] ConversationItemInputAudioTranscriptionFailed(ConversationItemInputAudioTranscriptionFailed), #[serde(rename = "conversation.item.truncated")] ConversationItemTruncated(ConversationItemTruncated), #[serde(rename = "conversation.item.deleted")] ConversationItemDeleted(ConversationItemDeleted), + #[serde(rename = "conversation.item.done")] + ConversationItemDone(ConversationItemDone), #[serde(rename = "output_audio_buffer.started")] OutputAudioBufferStarted(OutputAudioBufferStarted), #[serde(rename = "output_audio_buffer.stopped")] @@ -295,18 +374,34 @@ pub enum ServerEvent { ResponseTextDelta(ResponseTextDelta), #[serde(rename = "response.text.done")] ResponseTextDone(ResponseTextDone), - #[serde(rename = "response.audio_transcript.delta")] - ResponseAudioTranscriptDelta(ResponseAudioTranscriptDelta), - #[serde(rename = "response.audio_transcript.done")] - ResponseAudioTranscriptDone(ResponseAudioTranscriptDone), - #[serde(rename = "response.audio.delta")] - ResponseAudioDelta(ResponseAudioDelta), - #[serde(rename = "response.audio.done")] - ResponseAudioDone(ResponseAudioDone), + #[serde(rename = "response.output_audio_transcript.delta")] + ResponseOutputAudioTranscriptDelta(ResponseOutputAudioTranscriptDelta), + #[serde(rename = "response.output_audio_transcript.done")] + ResponseOutputAudioTranscriptDone(ResponseOutputAudioTranscriptDone), + #[serde(rename = "response.output_audio.delta")] + ResponseOutputAudioDelta(ResponseOutputAudioDelta), + #[serde(rename = "response.output_audio.done")] + ResponseOutputAudioDone(ResponseOutputAudioDone), #[serde(rename = "response.function_call_arguments.delta")] ResponseFunctionCallArgumentsDelta(ResponseFunctionCallArgumentsDelta), #[serde(rename = "response.function_call_arguments.done")] ResponseFunctionCallArgumentsDone(ResponseFunctionCallArgumentsDone), + #[serde(rename = "response.mcp_call_arguments.delta")] + 
ResponseMcpCallArgumentsDelta(ResponseMcpCallArgumentsDelta), + #[serde(rename = "response.mcp_call_arguments.done")] + ResponseMcpCallArgumentsDone(ResponseMcpCallArgumentsDone), + #[serde(rename = "response.mcp_call.in_progress")] + ResponseMcpCallInProgress(ResponseMcpCallInProgress), + #[serde(rename = "response.mcp_call.completed")] + ResponseMcpCallCompleted(ResponseMcpCallCompleted), + #[serde(rename = "response.mcp_call.failed")] + ResponseMcpCallFailed(ResponseMcpCallFailed), #[serde(rename = "rate_limits.updated")] RateLimitsUpdated(RateLimitsUpdated), + #[serde(rename = "mcp_list_tools.in_progress")] + McpListToolsInProgress(McpListToolsInProgress), + #[serde(rename = "mcp_list_tools.completed")] + McpListToolsCompleted(McpListToolsCompleted), + #[serde(rename = "mcp_list_tools.failed")] + McpListToolsFailed(McpListToolsFailed), } diff --git a/src/realtime/types.rs b/src/realtime/types.rs index a90ff27f..7b386ba1 100644 --- a/src/realtime/types.rs +++ b/src/realtime/types.rs @@ -1,29 +1,87 @@ +use std::collections::HashMap; + use serde::{Deserialize, Serialize}; +#[derive(Debug, Serialize, Deserialize, Clone)] +#[serde(rename_all = "lowercase", tag = "type")] +pub enum Session { + Realtime(RealtimeSession), + Transcription(TranscriptionSession), +} +impl Default for Session { + fn default() -> Self { + Self::Realtime(Default::default()) + } +} + #[derive(Debug, Serialize, Deserialize, Clone, Default)] -pub struct Session { +pub struct TranscriptionSession { #[serde(skip_serializing_if = "Option::is_none")] - pub modalities: Option>, + pub audio: Option, #[serde(skip_serializing_if = "Option::is_none")] - pub instructions: Option, + pub include: Option>, +} + +#[derive(Debug, Serialize, Deserialize, Clone, Default)] +pub struct RealtimeSession { #[serde(skip_serializing_if = "Option::is_none")] - pub voice: Option, + pub audio: Option, #[serde(skip_serializing_if = "Option::is_none")] - pub input_audio_format: Option, + pub include: Option>, 
#[serde(skip_serializing_if = "Option::is_none")] - pub output_audio_format: Option, + pub model: Option, + /// Just `Audio` by default. Can also be `Text` for text-only. Both at the same time are not supported. #[serde(skip_serializing_if = "Option::is_none")] - pub input_audio_transcription: Option, + pub output_modalities: Option>, #[serde(skip_serializing_if = "Option::is_none")] - pub turn_detection: Option, + pub instructions: Option, #[serde(skip_serializing_if = "Option::is_none")] pub tools: Option>, #[serde(skip_serializing_if = "Option::is_none")] pub tool_choice: Option, #[serde(skip_serializing_if = "Option::is_none")] - pub temperature: Option, - #[serde(skip_serializing_if = "Option::is_none")] pub max_output_tokens: Option, + // Todo: Support prompt template reference and variables + // #[serde(skip_serializing_if = "Option::is_none")] + // pub prompt: Option, + // Todo: Support tracing config + // #[serde(skip_serializing_if = "Option::is_none")] + // pub tracing: Option, // "auto" or config object + // Todo: Support truncation config (poorly documented atm) + // #[serde(skip_serializing_if = "Option::is_none")] + // pub truncation: Option, // "auto" or config object +} + +#[derive(Debug, Serialize, Deserialize, Clone)] +pub enum RealtimeModel { + #[serde(rename = "gpt-realtime")] + GptRealtime, + #[serde(rename = "gpt-4o-realtime-preview")] + Gpt4oRealtimePreview, + #[serde(rename = "gpt-4o-mini-realtime-preview")] + Gpt4oMiniRealtimePreview, + #[serde(rename = "gpt-realtime-2025-08-28")] + GptRealtime20250828, + #[serde(rename = "gpt-4o-realtime-preview-2024-12-17")] + Gpt4oRealtimePreview20241217, + #[serde(rename = "gpt-4o-realtime-preview-2024-10-01")] + Gpt4oRealtimePreview20241001, + #[serde(rename = "gpt-4o-mini-realtime-preview-2024-12-17")] + Gpt4oMiniRealtimePreview20241217, +} + +#[derive(Debug, Serialize, Deserialize, Clone)] +pub enum AdditionalServerOutput { + /// Include logprobs for input audio transcription. 
+ #[serde(rename = "item.input_audio_transcription.logprobs")] + Logprobs, +} + +#[derive(Debug, Serialize, Deserialize, Clone)] +#[serde(rename_all = "lowercase")] +pub enum OutputModality { + Audio, + Text, } #[derive(Debug, Serialize, Deserialize, Clone)] @@ -32,20 +90,158 @@ pub enum RealtimeVoice { Alloy, Ash, Ballad, + Cedar, Coral, Echo, + Marin, Sage, Shimmer, Verse, } #[derive(Debug, Serialize, Deserialize, Clone)] +pub struct AudioConfig { + pub input: AudioInput, + pub output: AudioOutput, +} + +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct AudioInput { + pub format: AudioFormat, + /// Configuration for input audio noise reduction. This can be set to null to turn off. Noise reduction filters audio added to the input audio buffer before it is sent to VAD and the model. + /// Filtering the audio can improve VAD and turn detection accuracy (reducing false positives) and model performance by improving perception of the input audio. + pub noise_reduction: Option, + /// Configuration for input audio transcription, defaults to off and can be set to null to turn off once on. Input audio transcription is not native to the model, since the model consumes audio directly. Transcription runs asynchronously through the /audio/transcriptions endpoint and should be treated as guidance of input audio content rather than precisely what the model heard. The client can optionally set the language and prompt for transcription, these offer additional guidance to the transcription service. + pub transcription: Option, + /// Configuration for turn detection, either Server VAD or Semantic VAD. This can be set to null to turn off, in which case the client must manually trigger model response. + pub turn_detection: Option, +} + +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct TranscriptionConfig { + /// The language of the input audio in ISO-639-1 (e.g. "en") format. Will improve accuracy and latency if set. 
+ #[serde(skip_serializing_if = "Option::is_none")] + pub language: Option, + pub model: TranscriptionModel, + /// An optional text to guide the model's style or continue a previous audio segment. For `whisper-1`, the prompt is a list of keywords. For `gpt-4o-transcribe` models, the prompt is a free text string, for example "expect words related to technology". + #[serde(skip_serializing_if = "Option::is_none")] + pub prompt: Option, +} + +#[derive(Debug, Serialize, Deserialize, Clone)] +pub enum TranscriptionModel { + #[serde(rename = "whisper-1")] + Whisper1, + #[serde(rename = "gpt-4o-transcribe-latest")] + Gpt4oTranscribeLatest, + #[serde(rename = "gpt-4o-mini-transcribe")] + Gpt4oMiniTranscribe, + #[serde(rename = "gpt-4o-transcribe")] + Gpt4oTranscribe, +} + +#[derive(Debug, Serialize, Deserialize, Clone)] +#[serde(rename_all = "snake_case")] +pub enum VadMode { + SemanticVad(SemanticVadConfig), +} + +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct ServerVadConfig { + /// Whether or not to automatically generate a response when a VAD stop event occurs. + pub create_response: bool, + /// Optional timeout after which a model response will be triggered automatically. This is useful for situations in which a long pause from the user is unexpected, such as a phone call. The model will effectively prompt the user to continue the conversation based on the current context. + /// The timeout value will be applied after the last model response's audio has finished playing, i.e. it's set to the `response.done` time plus audio playback duration. + /// An `input_audio_buffer.timeout_triggered` event (plus events associated with the Response) will be emitted when the timeout is reached. Idle timeout is currently only supported for server_vad mode. + pub idle_timeout_ms: Option, + /// Whether or not to automatically interrupt any ongoing response with output to the default conversation (i.e. `conversation` of `auto`) when a VAD start event occurs. 
+ pub interrupt_response: bool, + /// Used only for server_vad mode. Amount of audio to include before the VAD detected speech (in milliseconds). Defaults to 300ms. + #[serde(skip_serializing_if = "Option::is_none")] + pub prefix_padding_ms: Option, + /// Used only for server_vad mode. Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms. With shorter values the model will respond more quickly, but may jump in on short pauses from the user. + #[serde(skip_serializing_if = "Option::is_none")] + pub silence_duration_ms: Option, + /// Used only for server_vad mode. Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5. A higher threshold will require louder audio to activate the model, and thus might perform better in noisy environments. + #[serde(skip_serializing_if = "Option::is_none")] + pub threshold: Option, +} + +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct SemanticVadConfig { + /// Whether or not to automatically generate a response when a VAD stop event occurs. + pub create_response: bool, + pub eagerness: SemanticVadEagerness, + /// Whether or not to automatically interrupt any ongoing response with output to the default conversation (i.e. `conversation` of `auto`) when a VAD start event occurs. + pub interrupt_response: bool, +} + +/// low will wait longer for the user to continue speaking, high will respond more quickly. auto is the default and is equivalent to medium. low, medium, and high have max timeouts of 8s, 4s, and 2s respectively. +#[derive(Debug, Serialize, Deserialize, Clone)] +pub enum SemanticVadEagerness { + /// Equivalent to Medium. 
+ Auto, + Low, + Medium, + High, +} + +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct NoiseReduction { + #[serde(rename = "type")] + pub reduction_type: NoiseReductionType, +} + +#[derive(Debug, Serialize, Deserialize, Clone)] +#[serde(rename_all = "snake_case")] +pub enum NoiseReductionType { + /// `near_field` is for close-talking microphones such as headphones + NearField, + /// `far_field` is for far-field microphones such as laptop or conference room microphones + FarField, +} + +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct AudioOutput { + pub format: AudioFormat, + /// The speed of the model's spoken response as a multiple of the original speed. 1.0 is the default speed. 0.25 is the minimum speed. 1.5 is the maximum speed. This value can only be changed in between model turns, not while a response is in progress. + /// This parameter is a post-processing adjustment to the audio after it is generated, it's also possible to prompt the model to speak faster or slower. + pub speed: f64, + /// The voice the model uses to respond. Voice cannot be changed during the session once the model has responded with audio at least once. + #[serde(skip_serializing_if = "Option::is_none")] + pub voice: Option, +} + +#[derive(Debug, Serialize, Deserialize, Clone)] +#[serde(untagged)] pub enum AudioFormat { - #[serde(rename = "pcm16")] - PCM16, - #[serde(rename = "g711_ulaw")] + Pcm(AudioFormatDefinitionWithSampleRate), + Other(AudioFormatDefinition), +} + +/// This form of audio format definition is *only* used for the raw PCM format. +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct AudioFormatDefinitionWithSampleRate { + /// This must always be `24000` for PCM. + pub rate: i32, + /// Must be `Pcm`. 
+ #[serde(rename = "type")] + pub audio_type: AudioFormatIdentifier, +} + +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct AudioFormatDefinition { + #[serde(rename = "type")] + pub audio_type: AudioFormatIdentifier, +} + +#[derive(Debug, Serialize, Deserialize, Clone)] +pub enum AudioFormatIdentifier { + #[serde(rename = "audio/pcm")] + Pcm, + #[serde(rename = "audio/pcmu")] G711ULAW, - #[serde(rename = "g711_alaw")] + #[serde(rename = "audio/pcma")] G711ALAW, } @@ -60,22 +256,93 @@ pub struct AudioTranscription { #[serde(tag = "type")] pub enum TurnDetection { #[serde(rename = "server_vad")] - ServerVAD { - threshold: f32, - prefix_padding_ms: u32, - silence_duration_ms: u32, - }, + ServerVAD(ServerVadConfig), + SemanticVAD(SemanticVadConfig), } #[derive(Debug, Serialize, Deserialize, Clone)] #[serde(tag = "type")] +#[serde(rename_all = "lowercase")] pub enum ToolDefinition { - #[serde(rename = "function")] Function { name: String, description: String, parameters: serde_json::Value, }, + Mcp { + server_label: String, + allowed_tools: McpAllowedTools, + /// An OAuth access token that can be used with a remote MCP server, either with a custom MCP server URL or a service connector. Your application must handle the OAuth authorization flow and provide the token here. + #[serde(skip_serializing_if = "Option::is_none")] + authorization: Option, + /// One of server_url or connector_id must be provided but not both. + #[serde(skip_serializing_if = "Option::is_none")] + connector_id: Option, + #[serde(skip_serializing_if = "Option::is_none")] + headers: Option>, + /// Specify which of the MCP server's tools require approval. + require_approval: McpApprovalSettings, + #[serde(skip_serializing_if = "Option::is_none")] + server_description: Option, + /// One of server_url or connector_id must be provided but not both. 
+ #[serde(skip_serializing_if = "Option::is_none")] + server_url: Option, + }, +} + +#[derive(Debug, Serialize, Deserialize, Clone)] +#[serde(untagged)] +pub enum McpApprovalSettings { + Filter(McpApprovalFilter), + SinglePolicy(McpApprovalMode), +} + +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct McpApprovalFilter { + always: McpFilterObject, + never: McpFilterObject, +} + +#[derive(Debug, Serialize, Deserialize, Clone)] +#[serde(rename_all = "lowercase")] +pub enum McpApprovalMode { + /// Always require approval + Always, + /// Never require approval + Never, +} + +#[derive(Debug, Serialize, Deserialize, Clone)] +#[serde(untagged)] +pub enum McpAllowedTools { + FilterObject(McpFilterObject), + ToolNames(Vec), +} + +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct McpFilterObject { + read_only: bool, + tool_names: Vec, +} + +#[derive(Debug, Serialize, Deserialize, Clone)] +pub enum Connector { + #[serde(rename = "connector_dropbox")] + ConnectorDropbox, + #[serde(rename = "connector_gmail")] + ConnectorGmail, + #[serde(rename = "connector_googlecalendar")] + ConnectorGoogleCalendar, + #[serde(rename = "connector_googledrive")] + ConnectorGoogleDrive, + #[serde(rename = "connector_microsoftteams")] + ConnectorMicrosoftTeams, + #[serde(rename = "connector_outlookcalendar")] + ConnectorOutlookCalendar, + #[serde(rename = "connector_outlookemail")] + ConnectorOutlookEmail, + #[serde(rename = "connector_sharepoint")] + ConnectorSharepoint, } #[derive(Debug, Serialize, Deserialize, Clone)] @@ -89,6 +356,12 @@ pub enum ToolChoice { r#type: FunctionType, name: String, }, + #[serde(untagged)] + Mcp { + r#type: McpType, + name: String, + server_label: String, + }, } #[derive(Debug, Serialize, Deserialize, Clone)] @@ -97,23 +370,33 @@ pub enum FunctionType { Function, } +#[derive(Debug, Serialize, Deserialize, Clone)] +#[serde(rename_all = "lowercase")] +pub enum McpType { + Mcp, +} + #[derive(Debug, Serialize, Deserialize, Clone)] #[serde(untagged)] 
pub enum MaxOutputTokens { + Inf(String), Num(u16), - #[serde(rename = "inf")] - Inf, } -#[derive(Debug, Serialize, Deserialize, Clone)] +#[derive(Debug, Serialize, Deserialize, Clone, PartialEq)] #[serde(rename_all = "snake_case")] pub enum ItemType { Message, FunctionCall, FunctionCallOutput, + McpApprovalResponse, + McpListTools, + #[serde(rename = "mcp_call")] // not consistent with the docs + McpToolCall, + McpApprovalRequest, } -#[derive(Debug, Serialize, Deserialize, Clone)] +#[derive(Debug, Serialize, Deserialize, Clone, PartialEq)] #[serde(rename_all = "snake_case")] pub enum ItemStatus { Completed, @@ -121,7 +404,7 @@ pub enum ItemStatus { Incomplete, } -#[derive(Debug, Serialize, Deserialize, Clone)] +#[derive(Debug, Serialize, Deserialize, Clone, PartialEq)] #[serde(rename_all = "lowercase")] pub enum ItemRole { User, @@ -129,13 +412,13 @@ pub enum ItemRole { System, } -#[derive(Debug, Serialize, Deserialize, Clone)] +#[derive(Debug, Serialize, Deserialize, Clone, PartialEq)] #[serde(rename_all = "snake_case")] pub enum ItemContentType { InputText, InputAudio, Text, - Audio, + OutputAudio, } #[derive(Debug, Serialize, Deserialize, Clone)] @@ -144,7 +427,7 @@ pub struct ItemContent { #[serde(skip_serializing_if = "Option::is_none")] pub text: Option, #[serde(skip_serializing_if = "Option::is_none")] - pub audio: Option, + pub output_audio: Option, #[serde(skip_serializing_if = "Option::is_none")] pub transcript: Option, } @@ -153,6 +436,9 @@ pub struct ItemContent { pub struct Item { #[serde(skip_serializing_if = "Option::is_none")] pub id: Option, + // Generic to all Item types: + #[serde(skip_serializing_if = "Option::is_none")] + pub previous_item_id: Option, #[serde(skip_serializing_if = "Option::is_none")] pub r#type: Option, #[serde(skip_serializing_if = "Option::is_none")] @@ -167,8 +453,30 @@ pub struct Item { pub name: Option, #[serde(skip_serializing_if = "Option::is_none")] pub arguments: Option, + // found both in function and MCP tool calls 
#[serde(skip_serializing_if = "Option::is_none")] pub output: Option, + // fields specific to approval request items: + #[serde(skip_serializing_if = "Option::is_none")] + pub approval_request_id: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub approve: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub reason: Option, + // common to all MCP items: + #[serde(skip_serializing_if = "Option::is_none")] + pub server_label: Option, + // specific to MCP tool list: + // "name", server_label is already there + #[serde(skip_serializing_if = "Option::is_none")] + pub tools: Option>, + // to MCP tool call: + // arguments already there from the deprecated "functions" functionality + // id, name, server_label, approval_request_id, output already there + #[serde(skip_serializing_if = "Option::is_none")] + pub error: Option, + // specific to MCP approval request: + // arguments, id, name, server_label all already there } impl TryFrom for Item { @@ -273,3 +581,12 @@ pub struct RateLimit { pub remaining: u32, pub reset_seconds: f32, } + +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct McpToolListing { + pub input_schema: serde_json::Value, + pub name: String, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub annotations: Option, + pub description: String, +} diff --git a/src/v1/api.rs b/src/v1/api.rs index ce0b1c4e..1defa82d 100644 --- a/src/v1/api.rs +++ b/src/v1/api.rs @@ -32,6 +32,7 @@ use crate::v1::message::{ }; use crate::v1::model::{ModelResponse, ModelsResponse}; use crate::v1::moderation::{CreateModerationRequest, CreateModerationResponse}; +use crate::v1::realtime_calls::{AcceptCallRequest, ReferCallRequest}; use crate::v1::run::{ CreateRunRequest, CreateThreadAndRunRequest, ListRun, ListRunStep, ModifyRunRequest, RunObject, RunStepObject, @@ -64,7 +65,7 @@ pub struct OpenAIClientBuilder { headers: Option, } -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct OpenAIClient { api_endpoint: String, api_key: 
Option<String>, @@ -194,6 +195,32 @@ impl OpenAIClient {         self.handle_response(response).await     } +    /// `POST`s but expects an empty response rather than anything to deserialize. +    async fn post_empty( +        &mut self, +        path: &str, +        body: &impl serde::ser::Serialize, +    ) -> Result<(), APIError> { +        let request = self.build_request(Method::POST, path).await; +        let request = request.json(body); +        let response = request.send().await?; + +        if response.status().is_success() { +            let headers = response.headers().clone(); +            self.response_headers = Some(headers); +            Ok(()) +        } else { +            let status = response.status(); +            let error_message = response +                .text() +                .await +                .unwrap_or_else(|_| format!("Unknown error - no body text was provided")); +            Err(APIError::CustomError { +                message: format!("{status}: {error_message}"), +            }) +        } +    } +     async fn get<T: serde::de::DeserializeOwned>(&mut self, path: &str) -> Result<T, APIError> {         let request = self.build_request(Method::GET, path).await;         let response = request.send().await?; @@ -796,6 +823,46 @@ impl OpenAIClient {         self.delete(&format!("models/{model_id}")).await     } +    pub async fn accept_call( +        &mut self, +        call_id: &str, +        accept: AcceptCallRequest, +    ) -> Result<(), APIError> { +        // /realtime/calls endpoints return empty responses on success +        self.post_empty(&format!("realtime/calls/{call_id}/accept"), &accept) +            .await +    } + +    pub async fn hangup_call(&mut self, call_id: &str) -> Result<(), APIError> { +        // /realtime/calls endpoints return empty responses on success +        self.post_empty(&format!("realtime/calls/{call_id}/hangup"), &()) +            .await +    } + +    /// Note that `reject_call` is very poorly documented and seems to be non-functional even in the GA release as of 2025-09-11: +    /// +    /// - it returns a 404 if there is no session associated with the call (ie.
it hasn't been `accept`ed yet) + /// - it returns a 500 if there *is* one + /// - in neither case does the call actually end + /// + /// Per https://community.openai.com/t/how-can-i-programatically-end-a-gpt-realtime-sip-call/1355362 a `hangup` method exists, not documented elsewhere; + /// a sensible workaround is to `accept` the call and immediately `hangup`. See `hangup_call`. + pub async fn reject_call(&mut self, call_id: &str) -> Result<(), APIError> { + // ditto WRT successful body + self.post_empty(&format!("realtime/calls/{call_id}/reject"), &()) + .await + } + + pub async fn refer_call( + &mut self, + call_id: &str, + refer: ReferCallRequest, + ) -> Result<(), APIError> { + // ditto WRT successful body + self.post_empty(&format!("realtime/calls/{call_id}/refer"), &refer) + .await + } + fn build_url_with_preserved_query(&self, path: &str) -> Result { let (base, query_opt) = match self.api_endpoint.split_once('?') { Some((b, q)) => (b.trim_end_matches('/'), Some(q)), diff --git a/src/v1/mod.rs b/src/v1/mod.rs index d44ed319..13e3d9c5 100644 --- a/src/v1/mod.rs +++ b/src/v1/mod.rs @@ -13,6 +13,7 @@ pub mod fine_tuning; pub mod image; pub mod model; pub mod moderation; +pub mod realtime_calls; // beta pub mod assistant; diff --git a/src/v1/realtime_calls.rs b/src/v1/realtime_calls.rs new file mode 100644 index 00000000..97887558 --- /dev/null +++ b/src/v1/realtime_calls.rs @@ -0,0 +1,22 @@ +use serde::{Deserialize, Serialize}; + +use crate::realtime::types::Session; + +/// Used to start a realtime session based on an incoming call that you can then connect to over WSS with `RealtimeSipClient` from `openai_api_rs::realtime::sip`. +/// Note that this is poorly documented by OpenAI with the only example data given in https://platform.openai.com/docs/guides/realtime-sip#handle-the-webhook. 
+/// +/// Per an OpenAI dev (https://community.openai.com/t/how-to-setup-transcription-on-realtime-api-with-sip/1355068/12) anything that can be passed to `session.update` over WSS can be passed to /accept, +/// as well as `model`, ordinarily reserved for `session.create`. +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct AcceptCallRequest { + /// The session must *always* be a `realtime` one. + #[serde(flatten)] + pub session: Session, +} + +/// Used to redirect a call to another number. Per https://platform.openai.com/docs/guides/realtime-sip#handle-the-webhook the Tel-URI scheme may be used. +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct ReferCallRequest { + /// The URI to redirect the call to, for example `tel:+14152909007` + pub target_uri: String, +}