From 32827b47260bc7285c0f088d0b167618f165abd9 Mon Sep 17 00:00:00 2001 From: Scott Willeke Date: Thu, 20 Feb 2025 20:34:41 -0800 Subject: [PATCH] feat: The "session request" is no longer managed by the client. The hosting application is expected to manage the session request. This has presented a cleaner and more reliable API for us in production. BREAKING CHANGE: The sessionRequest argument is no longer part of the constructor. The sessionRequested argument is no longer provided to the getRealtimeEphemeralAPIKey callback. The host application should know what session it wants and can always request whatever session it wants in the getRealtimeEphemeralAPIKey. See the WebRTC example for how to do this. --- .../src/pages/WebRTCExample.tsx | 39 +++++--- packages/browser/src/WebRTC/RealtimeClient.ts | 94 +++++-------------- packages/browser/src/openai/index.ts | 3 + 3 files changed, 57 insertions(+), 79 deletions(-) diff --git a/apps/browser-example/src/pages/WebRTCExample.tsx b/apps/browser-example/src/pages/WebRTCExample.tsx index b427100..7dbb0bd 100644 --- a/apps/browser-example/src/pages/WebRTCExample.tsx +++ b/apps/browser-example/src/pages/WebRTCExample.tsx @@ -5,7 +5,10 @@ import { } from "../components/RealtimeSessionView" import { RealtimeClient } from "@tsorta/browser/WebRTC" import { PageProps } from "./props" -import { RealtimeConversationItem } from "@tsorta/browser/openai" +import { + RealtimeConversationItem, + RealtimeSessionCreateResponse, +} from "@tsorta/browser/openai" export function WebRTCExample({ apiKey, @@ -37,17 +40,25 @@ export function WebRTCExample({ const client = new RealtimeClient( navigator, - // @ts-expect-error TS6133: 'sessionRequested' is declared but its value is never read. - ({ sessionRequested }) => { + async () => { // NOTE: For the sake of the example, we're using a "real" OpenAI API - // key rather than a Realtime API Session ephemeral key, as you - // should do in a production app. 
So this sessionRequested argument - // isn't useful in the example, but in a production app you can use - // it to request a session with the these parameters. - return apiKey + // key in *the browser*. **DO NOT DO THIS**. You should make this request + // for the ephemeral key on a backend server where you can protect + // the key. + + const r = await fetch("https://api.openai.com/v1/realtime/sessions", { + method: "POST", + headers: { + Authorization: `Bearer ${apiKey}`, + "Content-Type": "application/json", + }, + body: JSON.stringify(sessionRequest), + }) + const data = (await r.json()) as RealtimeSessionCreateResponse + + return data.client_secret.value }, - audioElementRef.current, - sessionRequest + audioElementRef.current ) setClient(client) @@ -59,7 +70,13 @@ export function WebRTCExample({ setConversation(event.conversation) }) - await client.start() + try { + await client.start() + } catch (e) { + // TODO: put an alert on the top to show error + console.error("Error starting session", e) + return + } onSessionStatusChanged("recording") }, diff --git a/packages/browser/src/WebRTC/RealtimeClient.ts b/packages/browser/src/WebRTC/RealtimeClient.ts index b8fa96d..408620e 100644 --- a/packages/browser/src/WebRTC/RealtimeClient.ts +++ b/packages/browser/src/WebRTC/RealtimeClient.ts @@ -53,9 +53,6 @@ const RealtimeClientDefaultOptions: RealtimeClientOptions = { baseUrl: "https://api.openai.com/v1/realtime", } -interface EphemeralApiKeyOptions { - sessionRequested: RealtimeSessionCreateRequest -} /** * A TypeScript client for the OpenAI Realtime API using WebRTC in the browser. */ @@ -78,16 +75,12 @@ export class RealtimeClient { /** * Create a new client. * @param getRealtimeEphemeralAPIKey This is a function that you should implement to return the Ephemeral OpenAI API key that is used to authenticate with the OpenAI Realtime API. It should be an ephemeral key as described at https://platform.openai.com/docs/guides/realtime-webrtc#creating-an-ephemeral-token. 
You will probably need to make a call to your server here to fetch the key. - * @param sessionRequested The session parameters you want from the Realtime API. If these are found to be different it will re-request them to try to match this session. */ constructor( private readonly navigator: Navigator, - private readonly getRealtimeEphemeralAPIKey: ( - options: EphemeralApiKeyOptions - ) => Promise | string, + private readonly getRealtimeEphemeralAPIKey: () => Promise | string, private readonly audioElement: HTMLAudioElement, - private readonly sessionRequested: RealtimeSessionCreateRequest, - options: Partial = RealtimeClientDefaultOptions + options: Partial = RealtimeClientDefaultOptions, ) { const opt = { ...RealtimeClientDefaultOptions, ...options } this.recordedAudioChunkDuration = opt.recordedAudioChunkDuration @@ -101,7 +94,7 @@ export class RealtimeClient { */ public addEventListener( event: TEventName, - listener: EventTargetListener + listener: EventTargetListener, ): void { this.emitter.addEventListener(event, listener) } @@ -204,7 +197,7 @@ export class RealtimeClient { this.audioChunks.push(...audioChunks) this.emitter.dispatchTypedEvent( "recordedAudioChanged", - new RecordedAudioChangedEvent(this.audioChunks) + new RecordedAudioChangedEvent(this.audioChunks), ) } @@ -212,7 +205,7 @@ export class RealtimeClient { this.audioChunks = audioChunks this.emitter.dispatchTypedEvent( "recordedAudioChanged", - new RecordedAudioChangedEvent(this.audioChunks) + new RecordedAudioChangedEvent(this.audioChunks), ) } @@ -225,9 +218,7 @@ export class RealtimeClient { let apiKey: string try { - apiKey = await this.getRealtimeEphemeralAPIKey({ - sessionRequested: this.sessionRequested, - }) + apiKey = await this.getRealtimeEphemeralAPIKey() } catch (err) { throw new Error("getRealtimeEphemeralAPIKey handler failed.", { cause: err, @@ -274,7 +265,7 @@ export class RealtimeClient { // Listen for server-sent events on the data channel this.dataChannel.addEventListener( 
"message", - this.receiveServerMessage.bind(this) + this.receiveServerMessage.bind(this), ) this.dataChannel.addEventListener("error", (e) => { log.error("Data channel error from server: %o", e.error) @@ -308,7 +299,7 @@ export class RealtimeClient { this.session = undefined this.emitter.dispatchTypedEvent( "sessionUpdated", - new SessionUpdatedEvent(this.session) + new SessionUpdatedEvent(this.session), ) } } @@ -323,7 +314,7 @@ export class RealtimeClient { this.emitter.dispatchTypedEvent( "serverEvent", - new RealtimeServerEventEvent(parsedEvent) + new RealtimeServerEventEvent(parsedEvent), ) } @@ -383,49 +374,15 @@ export class RealtimeClient { client.session = sessionEvent.session client.emitter.dispatchTypedEvent( "sessionCreated", - new SessionCreatedEvent(sessionEvent.session) + new SessionCreatedEvent(sessionEvent.session), ) - - if (!client.sessionRequested) { - throw new Error("No session request") - } - - // NOTE: When we create a session with OpenAI, it ignores things like input_audio_transcription?.model !== "whisper-1"; So we update it again if it doesn't match the session. 
- let updatedSession: RealtimeSessionCreateRequest = { - ...client.sessionRequested, - } - let hasSessionMismatch = false - - for (const key of Object.keys(client.sessionRequested) as Array< - keyof RealtimeSessionCreateRequest - >) { - const requestValue = client.sessionRequested[key] - const sessionValue = sessionEvent.session[key] - - if (compareValuesIgnoreNullProperties(requestValue, sessionValue)) { - continue - } - log.debug( - `session mismatch on ${key}: %o !== %o`, - requestValue, - sessionValue - ) - hasSessionMismatch = true - } - if (hasSessionMismatch) { - const updateSessionEvent: RealtimeClientEventSessionUpdate = { - type: "session.update", - session: updatedSession, - } - client.sendClientEvent(updateSessionEvent) - } }, "session.updated": (client, event) => { const sessionEvent = event as RealtimeServerEventSessionUpdated client.session = sessionEvent.session client.emitter.dispatchTypedEvent( "sessionUpdated", - new SessionUpdatedEvent(sessionEvent.session) + new SessionUpdatedEvent(sessionEvent.session), ) }, "conversation.item.created": (client, event) => { @@ -434,7 +391,7 @@ export class RealtimeClient { client.conversation.push(conversationEvent.item) client.emitter.dispatchTypedEvent( "conversationChanged", - new ConversationChangedEvent(client.conversation) + new ConversationChangedEvent(client.conversation), ) }, "response.audio_transcript.delta": (client, event) => { @@ -445,7 +402,7 @@ export class RealtimeClient { client.conversation, deltaEvent.item_id, deltaEvent.content_index, - deltaEvent + deltaEvent, ) if (!foundItem) { // error was logged in findConversationItemContent @@ -462,7 +419,7 @@ export class RealtimeClient { } else { if (foundContent.type !== "input_audio") { log.error( - `${event.type} Unexpected content type ${foundContent.type} for audio transcript` + `${event.type} Unexpected content type ${foundContent.type} for audio transcript`, ) return } @@ -470,7 +427,7 @@ export class RealtimeClient { } 
client.emitter.dispatchTypedEvent( "conversationChanged", - new ConversationChangedEvent(client.conversation) + new ConversationChangedEvent(client.conversation), ) }, "response.text.delta": (client, event) => { @@ -500,7 +457,7 @@ export class RealtimeClient { { log }, client.conversation, output.id!, - event + event, ) if (!conversationItem) { // TODO: findConversationItem already logged an error, we should probably pass in a value that tells it not to log @@ -508,7 +465,7 @@ export class RealtimeClient { client.conversation.push(output) client.emitter.dispatchTypedEvent( "conversationChanged", - new ConversationChangedEvent(client.conversation) + new ConversationChangedEvent(client.conversation), ) continue } @@ -523,7 +480,7 @@ export class RealtimeClient { // force update the conversation state: client.emitter.dispatchTypedEvent( "conversationChanged", - new ConversationChangedEvent(client.conversation) + new ConversationChangedEvent(client.conversation), ) } }, @@ -531,35 +488,36 @@ export class RealtimeClient { patchConversationItemWithCompletedTranscript( { log }, client.conversation, - event as RealtimeServerEventResponseAudioTranscriptDone + event as RealtimeServerEventResponseAudioTranscriptDone, ) client.emitter.dispatchTypedEvent( "conversationChanged", - new ConversationChangedEvent(client.conversation) + new ConversationChangedEvent(client.conversation), ) }, "conversation.item.input_audio_transcription.completed": ( client, - event + event, ) => { patchConversationItemWithCompletedTranscript( { log }, client.conversation, - event + event, ) client.emitter.dispatchTypedEvent( "conversationChanged", - new ConversationChangedEvent(client.conversation) + new ConversationChangedEvent(client.conversation), ) }, } } type RealtimeServerEventHandler< - TRealtimeServerEventType extends RealtimeServerEvent["type"] = RealtimeServerEvent["type"] + TRealtimeServerEventType extends + RealtimeServerEvent["type"] = RealtimeServerEvent["type"], > = ( client: 
RealtimeClient, - event: Extract + event: Extract, ) => void type RealtimeServerEventNames = RealtimeServerEvent["type"] diff --git a/packages/browser/src/openai/index.ts b/packages/browser/src/openai/index.ts index 596a12e..ce0e471 100644 --- a/packages/browser/src/openai/index.ts +++ b/packages/browser/src/openai/index.ts @@ -96,6 +96,9 @@ export type RealtimeSession = components["schemas"]["RealtimeSession"] export type RealtimeSessionCreateRequest = components["schemas"]["RealtimeSessionCreateRequest"] +export type RealtimeSessionCreateResponse = + components["schemas"]["RealtimeSessionCreateResponse"] + export type RealTimeSessionModels = RealtimeSessionCreateRequest["model"] /** Part of the @see RealtimeServerEventResponseDone event and others.