Skip to content

Commit 2b195b3

Browse files
apappascs authored and markpollack committed
refactor(tts): migrate OpenAI TTS to shared TextToSpeechModel interface and standardize on Double for speed
BREAKING CHANGE: OpenAI TTS now uses shared interfaces instead of provider-specific classes

- Replace deprecated OpenAI-specific classes (SpeechModel, SpeechPrompt, SpeechResponse, StreamingSpeechModel) with shared interfaces (TextToSpeechModel, TextToSpeechPrompt, TextToSpeechResponse, StreamingTextToSpeechModel)
- Update interface hierarchy: TextToSpeechModel now extends StreamingTextToSpeechModel for consistency with ChatModel pattern
- Add @FunctionalInterface annotation to StreamingTextToSpeechModel
- Migrate speed property from Float to Double across all OpenAI TTS components for consistency with ElevenLabs API and better type uniformity
- Update OpenAiAudioSpeechOptions to implement TextToSpeechOptions interface
- Add TextToSpeechModelTests with 7 comprehensive unit tests for interface behavior
- Improve null safety in TextToSpeechModel.call(String) default method
- Update documentation to reflect new interface hierarchy and Double usage
- Add migration guide to documentation for upgrading from deprecated classes

Signed-off-by: Alexandros Pappas <apappascs@gmail.com>
1 parent 2a2f155 commit 2b195b3

File tree

18 files changed

+495
-533
lines changed

18 files changed

+495
-533
lines changed

auto-configurations/models/spring-ai-autoconfigure-model-openai/src/main/java/org/springframework/ai/model/openai/autoconfigure/OpenAiAudioSpeechProperties.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ public class OpenAiAudioSpeechProperties extends OpenAiParentProperties {
3838

3939
public static final String DEFAULT_SPEECH_MODEL = OpenAiAudioApi.TtsModel.GPT_4_O_MINI_TTS.getValue();
4040

41-
private static final Float SPEED = 1.0f;
41+
private static final Double SPEED = 1.0;
4242

4343
private static final String VOICE = OpenAiAudioApi.SpeechRequest.Voice.ALLOY.getValue();
4444

models/spring-ai-elevenlabs/src/main/java/org/springframework/ai/elevenlabs/ElevenLabsTextToSpeechModel.java

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@
2323
import reactor.core.publisher.Flux;
2424

2525
import org.springframework.ai.audio.tts.Speech;
26-
import org.springframework.ai.audio.tts.StreamingTextToSpeechModel;
2726
import org.springframework.ai.audio.tts.TextToSpeechModel;
2827
import org.springframework.ai.audio.tts.TextToSpeechPrompt;
2928
import org.springframework.ai.audio.tts.TextToSpeechResponse;
@@ -35,12 +34,11 @@
3534
import org.springframework.util.MultiValueMap;
3635

3736
/**
38-
* Implementation of the {@link TextToSpeechModel} and {@link StreamingTextToSpeechModel}
39-
* interfaces
37+
* Implementation of the {@link TextToSpeechModel} interface for ElevenLabs TTS API.
4038
*
4139
* @author Alexandros Pappas
4240
*/
43-
public class ElevenLabsTextToSpeechModel implements TextToSpeechModel, StreamingTextToSpeechModel {
41+
public class ElevenLabsTextToSpeechModel implements TextToSpeechModel {
4442

4543
private final Logger logger = LoggerFactory.getLogger(getClass());
4644

models/spring-ai-openai/src/main/java/org/springframework/ai/openai/OpenAiAudioSpeechModel.java

Lines changed: 32 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -16,18 +16,20 @@
1616

1717
package org.springframework.ai.openai;
1818

19+
import java.util.List;
20+
1921
import org.slf4j.Logger;
2022
import org.slf4j.LoggerFactory;
2123
import reactor.core.publisher.Flux;
2224

25+
import org.springframework.ai.audio.tts.Speech;
26+
import org.springframework.ai.audio.tts.TextToSpeechModel;
27+
import org.springframework.ai.audio.tts.TextToSpeechOptions;
28+
import org.springframework.ai.audio.tts.TextToSpeechPrompt;
29+
import org.springframework.ai.audio.tts.TextToSpeechResponse;
2330
import org.springframework.ai.chat.metadata.RateLimit;
2431
import org.springframework.ai.openai.api.OpenAiAudioApi;
2532
import org.springframework.ai.openai.api.OpenAiAudioApi.SpeechRequest.AudioResponseFormat;
26-
import org.springframework.ai.openai.audio.speech.Speech;
27-
import org.springframework.ai.openai.audio.speech.SpeechModel;
28-
import org.springframework.ai.openai.audio.speech.SpeechPrompt;
29-
import org.springframework.ai.openai.audio.speech.SpeechResponse;
30-
import org.springframework.ai.openai.audio.speech.StreamingSpeechModel;
3133
import org.springframework.ai.openai.metadata.audio.OpenAiAudioSpeechResponseMetadata;
3234
import org.springframework.ai.openai.metadata.support.OpenAiResponseHeaderExtractor;
3335
import org.springframework.ai.retry.RetryUtils;
@@ -46,13 +48,13 @@
4648
* @see OpenAiAudioApi
4749
* @since 1.0.0-M1
4850
*/
49-
public class OpenAiAudioSpeechModel implements SpeechModel, StreamingSpeechModel {
51+
public class OpenAiAudioSpeechModel implements TextToSpeechModel {
5052

5153
/**
5254
* The speed of the default voice synthesis.
5355
* @see OpenAiAudioSpeechOptions
5456
*/
55-
private static final Float SPEED = 1.0f;
57+
private static final Double SPEED = 1.0;
5658

5759
private final Logger logger = LoggerFactory.getLogger(getClass());
5860

@@ -118,14 +120,14 @@ public OpenAiAudioSpeechModel(OpenAiAudioApi audioApi, OpenAiAudioSpeechOptions
118120

119121
@Override
120122
public byte[] call(String text) {
121-
SpeechPrompt speechRequest = new SpeechPrompt(text);
122-
return call(speechRequest).getResult().getOutput();
123+
TextToSpeechPrompt prompt = new TextToSpeechPrompt(text);
124+
return call(prompt).getResult().getOutput();
123125
}
124126

125127
@Override
126-
public SpeechResponse call(SpeechPrompt speechPrompt) {
128+
public TextToSpeechResponse call(TextToSpeechPrompt prompt) {
127129

128-
OpenAiAudioApi.SpeechRequest speechRequest = createRequest(speechPrompt);
130+
OpenAiAudioApi.SpeechRequest speechRequest = createRequest(prompt);
129131

130132
ResponseEntity<byte[]> speechEntity = this.retryTemplate
131133
.execute(ctx -> this.audioApi.createSpeech(speechRequest));
@@ -134,48 +136,42 @@ public SpeechResponse call(SpeechPrompt speechPrompt) {
134136

135137
if (speech == null) {
136138
logger.warn("No speech response returned for speechRequest: {}", speechRequest);
137-
return new SpeechResponse(new Speech(new byte[0]));
139+
return new TextToSpeechResponse(List.of(new Speech(new byte[0])));
138140
}
139141

140142
RateLimit rateLimits = OpenAiResponseHeaderExtractor.extractAiResponseHeaders(speechEntity);
141143

142-
return new SpeechResponse(new Speech(speech), new OpenAiAudioSpeechResponseMetadata(rateLimits));
144+
return new TextToSpeechResponse(List.of(new Speech(speech)), new OpenAiAudioSpeechResponseMetadata(rateLimits));
143145
}
144146

145147
/**
146148
* Streams the audio response for the given speech prompt.
147-
* @param speechPrompt The speech prompt containing the text and options for speech
149+
* @param prompt The speech prompt containing the text and options for speech
148150
* synthesis.
149-
* @return A Flux of SpeechResponse objects containing the streamed audio and
151+
* @return A Flux of TextToSpeechResponse objects containing the streamed audio and
150152
* metadata.
151153
*/
152154
@Override
153-
public Flux<SpeechResponse> stream(SpeechPrompt speechPrompt) {
155+
public Flux<TextToSpeechResponse> stream(TextToSpeechPrompt prompt) {
154156

155-
OpenAiAudioApi.SpeechRequest speechRequest = createRequest(speechPrompt);
157+
OpenAiAudioApi.SpeechRequest speechRequest = createRequest(prompt);
156158

157159
Flux<ResponseEntity<byte[]>> speechEntity = this.retryTemplate
158160
.execute(ctx -> this.audioApi.stream(speechRequest));
159161

160-
return speechEntity.map(entity -> new SpeechResponse(new Speech(entity.getBody()),
162+
return speechEntity.map(entity -> new TextToSpeechResponse(List.of(new Speech(entity.getBody())),
161163
new OpenAiAudioSpeechResponseMetadata(OpenAiResponseHeaderExtractor.extractAiResponseHeaders(entity))));
162164
}
163165

164-
private OpenAiAudioApi.SpeechRequest createRequest(SpeechPrompt request) {
165-
OpenAiAudioSpeechOptions options = this.defaultOptions;
166-
167-
if (request.getOptions() != null) {
168-
if (request.getOptions() instanceof OpenAiAudioSpeechOptions runtimeOptions) {
169-
options = this.merge(runtimeOptions, options);
170-
}
171-
else {
172-
throw new IllegalArgumentException("Prompt options are not of type SpeechOptions: "
173-
+ request.getOptions().getClass().getSimpleName());
174-
}
175-
}
166+
private OpenAiAudioApi.SpeechRequest createRequest(TextToSpeechPrompt prompt) {
167+
OpenAiAudioSpeechOptions runtimeOptions = (prompt
168+
.getOptions() instanceof OpenAiAudioSpeechOptions openAiAudioSpeechOptions) ? openAiAudioSpeechOptions
169+
: null;
170+
OpenAiAudioSpeechOptions options = (runtimeOptions != null) ? this.merge(runtimeOptions, this.defaultOptions)
171+
: this.defaultOptions;
176172

177173
String input = StringUtils.hasText(options.getInput()) ? options.getInput()
178-
: request.getInstructions().getText();
174+
: prompt.getInstructions().getText();
179175

180176
OpenAiAudioApi.SpeechRequest.Builder requestBuilder = OpenAiAudioApi.SpeechRequest.builder()
181177
.model(options.getModel())
@@ -187,6 +183,11 @@ private OpenAiAudioApi.SpeechRequest createRequest(SpeechPrompt request) {
187183
return requestBuilder.build();
188184
}
189185

186+
@Override
187+
public TextToSpeechOptions getDefaultOptions() {
188+
return this.defaultOptions;
189+
}
190+
190191
private OpenAiAudioSpeechOptions merge(OpenAiAudioSpeechOptions source, OpenAiAudioSpeechOptions target) {
191192
OpenAiAudioSpeechOptions.Builder mergedBuilder = OpenAiAudioSpeechOptions.builder();
192193

models/spring-ai-openai/src/main/java/org/springframework/ai/openai/OpenAiAudioSpeechOptions.java

Lines changed: 26 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
import com.fasterxml.jackson.annotation.JsonInclude;
2020
import com.fasterxml.jackson.annotation.JsonProperty;
2121

22-
import org.springframework.ai.model.ModelOptions;
22+
import org.springframework.ai.audio.tts.TextToSpeechOptions;
2323
import org.springframework.ai.openai.api.OpenAiAudioApi.SpeechRequest.AudioResponseFormat;
2424
import org.springframework.ai.openai.api.OpenAiAudioApi.SpeechRequest.Voice;
2525

@@ -33,7 +33,7 @@
3333
* @since 1.0.0-M1
3434
*/
3535
@JsonInclude(JsonInclude.Include.NON_NULL)
36-
public class OpenAiAudioSpeechOptions implements ModelOptions {
36+
public class OpenAiAudioSpeechOptions implements TextToSpeechOptions {
3737

3838
/**
3939
* ID of the model to use for generating the audio. For OpenAI's TTS API, use one of
@@ -67,7 +67,7 @@ public class OpenAiAudioSpeechOptions implements ModelOptions {
6767
* 4.0 (fastest). Defaults to 1 (normal)
6868
*/
6969
@JsonProperty("speed")
70-
private Float speed;
70+
private Double speed;
7171

7272
public static Builder builder() {
7373
return new Builder();
@@ -109,14 +109,34 @@ public void setResponseFormat(AudioResponseFormat responseFormat) {
109109
this.responseFormat = responseFormat;
110110
}
111111

112-
public Float getSpeed() {
112+
@Override
113+
public Double getSpeed() {
113114
return this.speed;
114115
}
115116

116-
public void setSpeed(Float speed) {
117+
public void setSpeed(Double speed) {
117118
this.speed = speed;
118119
}
119120

121+
// TextToSpeechOptions interface methods
122+
123+
@Override
124+
public String getFormat() {
125+
return (this.responseFormat != null) ? this.responseFormat.name().toLowerCase() : null;
126+
}
127+
128+
@Override
129+
@SuppressWarnings("unchecked")
130+
public OpenAiAudioSpeechOptions copy() {
131+
return OpenAiAudioSpeechOptions.builder()
132+
.model(this.model)
133+
.input(this.input)
134+
.voice(this.voice)
135+
.responseFormat(this.responseFormat)
136+
.speed(this.speed)
137+
.build();
138+
}
139+
120140
@Override
121141
public int hashCode() {
122142
final int prime = 31;
@@ -217,7 +237,7 @@ public Builder responseFormat(AudioResponseFormat responseFormat) {
217237
return this;
218238
}
219239

220-
public Builder speed(Float speed) {
240+
public Builder speed(Double speed) {
221241
this.options.speed = speed;
222242
return this;
223243
}

models/spring-ai-openai/src/main/java/org/springframework/ai/openai/api/OpenAiAudioApi.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -393,7 +393,7 @@ public record SpeechRequest(
393393
@JsonProperty("input") String input,
394394
@JsonProperty("voice") String voice,
395395
@JsonProperty("response_format") AudioResponseFormat responseFormat,
396-
@JsonProperty("speed") Float speed) {
396+
@JsonProperty("speed") Double speed) {
397397
// @formatter:on
398398

399399
public static Builder builder() {
@@ -488,7 +488,7 @@ public static final class Builder {
488488

489489
private AudioResponseFormat responseFormat = AudioResponseFormat.MP3;
490490

491-
private Float speed;
491+
private Double speed;
492492

493493
public Builder model(String model) {
494494
this.model = model;
@@ -515,7 +515,7 @@ public Builder responseFormat(AudioResponseFormat responseFormat) {
515515
return this;
516516
}
517517

518-
public Builder speed(Float speed) {
518+
public Builder speed(Double speed) {
519519
this.speed = speed;
520520
return this;
521521
}

models/spring-ai-openai/src/main/java/org/springframework/ai/openai/audio/speech/Speech.java

Lines changed: 0 additions & 82 deletions
This file was deleted.

0 commit comments

Comments
 (0)