@@ -57,6 +57,7 @@ def create(
         *,
         file: FileTypes,
         model: Union[str, AudioModel],
+        chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | NotGiven = NOT_GIVEN,
         include: List[TranscriptionInclude] | NotGiven = NOT_GIVEN,
         response_format: Union[Literal["json"], NotGiven] = NOT_GIVEN,
         language: str | NotGiven = NOT_GIVEN,
@@ -118,6 +119,7 @@ def create(
         file: FileTypes,
         model: Union[str, AudioModel],
         stream: Literal[True],
+        chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | NotGiven = NOT_GIVEN,
         include: List[TranscriptionInclude] | NotGiven = NOT_GIVEN,
         language: str | NotGiven = NOT_GIVEN,
         prompt: str | NotGiven = NOT_GIVEN,
@@ -152,6 +154,11 @@ def create(
 
               Note: Streaming is not supported for the `whisper-1` model and will be ignored.
 
+          chunking_strategy: Controls how the audio is cut into chunks. When set to `"auto"`, the server
+              first normalizes loudness and then uses voice activity detection (VAD) to choose
+              boundaries. A `server_vad` object can be provided to tweak VAD detection
+              parameters manually. If unset, the audio is transcribed as a single block.
+
           include: Additional information to include in the transcription response. `logprobs` will
               return the log probabilities of the tokens in the response to understand the
               model's confidence in the transcription. `logprobs` only works with
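
(Annotation, not part of the commit: a minimal sketch of the new `chunking_strategy` parameter in use, assuming an `OPENAI_API_KEY` in the environment and an illustrative local file name.)

```python
from openai import OpenAI

client = OpenAI()

# "auto" lets the server normalize loudness and pick chunk
# boundaries with voice activity detection (VAD).
with open("meeting.mp3", "rb") as audio_file:
    transcription = client.audio.transcriptions.create(
        file=audio_file,
        model="gpt-4o-transcribe",
        chunking_strategy="auto",
    )
print(transcription.text)
```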
@@ -200,6 +207,7 @@ def create(
         file: FileTypes,
         model: Union[str, AudioModel],
         stream: bool,
+        chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | NotGiven = NOT_GIVEN,
         include: List[TranscriptionInclude] | NotGiven = NOT_GIVEN,
         language: str | NotGiven = NOT_GIVEN,
         prompt: str | NotGiven = NOT_GIVEN,
@@ -234,6 +242,11 @@ def create(
 
               Note: Streaming is not supported for the `whisper-1` model and will be ignored.
 
+          chunking_strategy: Controls how the audio is cut into chunks. When set to `"auto"`, the server
+              first normalizes loudness and then uses voice activity detection (VAD) to choose
+              boundaries. A `server_vad` object can be provided to tweak VAD detection
+              parameters manually. If unset, the audio is transcribed as a single block.
+
           include: Additional information to include in the transcription response. `logprobs` will
               return the log probabilities of the tokens in the response to understand the
               model's confidence in the transcription. `logprobs` only works with
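
(Annotation: a sketch of the `include=["logprobs"]` option documented above; per the docstring it requires a `gpt-4o-*-transcribe` model with `response_format="json"`. The `transcription.logprobs` attribute is assumed from the SDK's `Transcription` model.)

```python
from openai import OpenAI

client = OpenAI()

with open("meeting.mp3", "rb") as audio_file:  # illustrative file name
    transcription = client.audio.transcriptions.create(
        file=audio_file,
        model="gpt-4o-mini-transcribe",
        response_format="json",
        include=["logprobs"],  # token log probabilities alongside the text
    )
print(transcription.logprobs)
```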
@@ -281,6 +294,7 @@ def create(
         *,
         file: FileTypes,
         model: Union[str, AudioModel],
+        chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | NotGiven = NOT_GIVEN,
         include: List[TranscriptionInclude] | NotGiven = NOT_GIVEN,
         language: str | NotGiven = NOT_GIVEN,
         prompt: str | NotGiven = NOT_GIVEN,
@@ -299,6 +313,7 @@ def create(
                 {
                     "file": file,
                     "model": model,
+                    "chunking_strategy": chunking_strategy,
                     "include": include,
                     "language": language,
                     "prompt": prompt,
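
(Annotation: besides `"auto"`, the `chunking_strategy` passed through the body above also accepts a `server_vad` object for manual tuning. The field names follow the API reference's `server_vad` object; the numeric values here are illustrative assumptions.)

```python
from openai import OpenAI

client = OpenAI()

# Sketch: tuning VAD chunking manually instead of "auto".
with open("meeting.mp3", "rb") as audio_file:  # illustrative file name
    transcription = client.audio.transcriptions.create(
        file=audio_file,
        model="gpt-4o-transcribe",
        chunking_strategy={
            "type": "server_vad",
            "threshold": 0.5,            # speech-detection sensitivity (0-1)
            "prefix_padding_ms": 300,    # audio kept before detected speech
            "silence_duration_ms": 500,  # trailing silence that closes a chunk
        },
    )
```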
@@ -357,6 +372,8 @@ async def create(
         *,
         file: FileTypes,
         model: Union[str, AudioModel],
+        chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | NotGiven = NOT_GIVEN,
+        include: List[TranscriptionInclude] | NotGiven = NOT_GIVEN,
         response_format: Union[Literal["json"], NotGiven] = NOT_GIVEN,
         language: str | NotGiven = NOT_GIVEN,
         prompt: str | NotGiven = NOT_GIVEN,
@@ -369,7 +386,73 @@ async def create(
         extra_query: Query | None = None,
         extra_body: Body | None = None,
         timeout: float | httpx.Timeout | None | NotGiven = NOT_GIVEN,
-    ) -> Transcription: ...
+    ) -> TranscriptionCreateResponse:
+        """
+        Transcribes audio into the input language.
+
+        Args:
+          file:
+              The audio file object (not file name) to transcribe, in one of these formats:
+              flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
+
+          model: ID of the model to use. The options are `gpt-4o-transcribe`,
+              `gpt-4o-mini-transcribe`, and `whisper-1` (which is powered by our open source
+              Whisper V2 model).
+
+          chunking_strategy: Controls how the audio is cut into chunks. When set to `"auto"`, the server
+              first normalizes loudness and then uses voice activity detection (VAD) to choose
+              boundaries. A `server_vad` object can be provided to tweak VAD detection
+              parameters manually. If unset, the audio is transcribed as a single block.
+
+          include: Additional information to include in the transcription response. `logprobs` will
+              return the log probabilities of the tokens in the response to understand the
+              model's confidence in the transcription. `logprobs` only works with
+              response_format set to `json` and only with the models `gpt-4o-transcribe` and
+              `gpt-4o-mini-transcribe`.
+
+          language: The language of the input audio. Supplying the input language in
+              [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
+              format will improve accuracy and latency.
+
+          prompt: An optional text to guide the model's style or continue a previous audio
+              segment. The
+              [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
+              should match the audio language.
+
+          response_format: The format of the output, in one of these options: `json`, `text`, `srt`,
+              `verbose_json`, or `vtt`. For `gpt-4o-transcribe` and `gpt-4o-mini-transcribe`,
+              the only supported format is `json`.
+
+          stream: If set to true, the model response data will be streamed to the client as it is
+              generated using
+              [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format).
+              See the
+              [Streaming section of the Speech-to-Text guide](https://platform.openai.com/docs/guides/speech-to-text?lang=curl#streaming-transcriptions)
+              for more information.
+
+              Note: Streaming is not supported for the `whisper-1` model and will be ignored.
+
+          temperature: The sampling temperature, between 0 and 1. Higher values like 0.8 will make the
+              output more random, while lower values like 0.2 will make it more focused and
+              deterministic. If set to 0, the model will use
+              [log probability](https://en.wikipedia.org/wiki/Log_probability) to
+              automatically increase the temperature until certain thresholds are hit.
+
+          timestamp_granularities: The timestamp granularities to populate for this transcription.
+              `response_format` must be set to `verbose_json` to use timestamp granularities.
+              Either or both of these options are supported: `word` or `segment`. Note: There
+              is no additional latency for segment timestamps, but generating word timestamps
+              incurs additional latency.
+
+          extra_headers: Send extra headers
+
+          extra_query: Add additional query parameters to the request
+
+          extra_body: Add additional JSON properties to the request
+
+          timeout: Override the client-level default timeout for this request, in seconds
+        """
+        ...
 
     @overload
     async def create(
@@ -418,6 +501,7 @@ async def create(
         file: FileTypes,
         model: Union[str, AudioModel],
         stream: Literal[True],
+        chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | NotGiven = NOT_GIVEN,
         include: List[TranscriptionInclude] | NotGiven = NOT_GIVEN,
         language: str | NotGiven = NOT_GIVEN,
         prompt: str | NotGiven = NOT_GIVEN,
@@ -452,6 +536,11 @@ async def create(
 
               Note: Streaming is not supported for the `whisper-1` model and will be ignored.
 
+          chunking_strategy: Controls how the audio is cut into chunks. When set to `"auto"`, the server
+              first normalizes loudness and then uses voice activity detection (VAD) to choose
+              boundaries. A `server_vad` object can be provided to tweak VAD detection
+              parameters manually. If unset, the audio is transcribed as a single block.
+
           include: Additional information to include in the transcription response. `logprobs` will
               return the log probabilities of the tokens in the response to understand the
               model's confidence in the transcription. `logprobs` only works with
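
(Annotation: a sketch of the async streaming path this overload covers. The `transcript.text.delta` event type and `event.delta` attribute are assumed from the SDK's transcription streaming events; streaming is ignored for `whisper-1`.)

```python
import asyncio

from openai import AsyncOpenAI


async def main() -> None:
    client = AsyncOpenAI()
    with open("meeting.mp3", "rb") as audio_file:  # illustrative file name
        stream = await client.audio.transcriptions.create(
            file=audio_file,
            model="gpt-4o-mini-transcribe",
            stream=True,
            chunking_strategy="auto",  # VAD-chosen boundaries while streaming
        )
        async for event in stream:
            if event.type == "transcript.text.delta":
                print(event.delta, end="", flush=True)


asyncio.run(main())
```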
@@ -500,6 +589,7 @@ async def create(
         file: FileTypes,
         model: Union[str, AudioModel],
         stream: bool,
+        chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | NotGiven = NOT_GIVEN,
         include: List[TranscriptionInclude] | NotGiven = NOT_GIVEN,
         language: str | NotGiven = NOT_GIVEN,
         prompt: str | NotGiven = NOT_GIVEN,
@@ -534,6 +624,11 @@ async def create(
 
               Note: Streaming is not supported for the `whisper-1` model and will be ignored.
 
+          chunking_strategy: Controls how the audio is cut into chunks. When set to `"auto"`, the server
+              first normalizes loudness and then uses voice activity detection (VAD) to choose
+              boundaries. A `server_vad` object can be provided to tweak VAD detection
+              parameters manually. If unset, the audio is transcribed as a single block.
+
           include: Additional information to include in the transcription response. `logprobs` will
               return the log probabilities of the tokens in the response to understand the
               model's confidence in the transcription. `logprobs` only works with
@@ -581,6 +676,7 @@ async def create(
         *,
         file: FileTypes,
         model: Union[str, AudioModel],
+        chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | NotGiven = NOT_GIVEN,
         include: List[TranscriptionInclude] | NotGiven = NOT_GIVEN,
         language: str | NotGiven = NOT_GIVEN,
         prompt: str | NotGiven = NOT_GIVEN,
@@ -599,6 +695,7 @@ async def create(
                 {
                     "file": file,
                     "model": model,
+                    "chunking_strategy": chunking_strategy,
                     "include": include,
                     "language": language,
                     "prompt": prompt,
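
(Annotation: a closing sketch for the `timestamp_granularities` option documented above; per the docstring it requires `response_format="verbose_json"`, and word-level timestamps are a `whisper-1` feature. The `transcription.words` attribute is assumed from the SDK's `TranscriptionVerbose` model.)

```python
from openai import OpenAI

client = OpenAI()

# Sketch: word-level timestamps ("segment" adds no extra latency,
# "word" does).
with open("meeting.mp3", "rb") as audio_file:  # illustrative file name
    transcription = client.audio.transcriptions.create(
        file=audio_file,
        model="whisper-1",
        response_format="verbose_json",
        timestamp_granularities=["word"],
    )
for word in transcription.words:
    print(word.word, word.start, word.end)
```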