Updated audio

djthorpe · djthorpe · commit 88001259000a · 2025-02-11T14:01:42.000+01:00
diff --git a/7283d94c6a7e5bc91e7875ccf51a96d3.m2a b/7283d94c6a7e5bc91e7875ccf51a96d3.m2a
diff --git a/README.md b/README.md
@@ -443,7 +443,7 @@ flag. For example, to have the model generate a caption for the image in the fil
 the following command:
 
 ```bash
-llm complete --model gpt-4o --file picture.png "Explain this image"
+llm complete --file picture.png "Explain this image"
 ```
 
 ### Generate an image
@@ -456,7 +456,22 @@ the following command:
 llm complete --model dall-e-3 --format image "A picture of a cat"
 ```
 
-It will write the file in the current working directory.
+Flags `--size`, `--quality` and `--style` can be used to specify the image parameters. It will write the image
+file in the current working directory.
+
+### Convert text to speech
+
+To have a model generate speech from text:
+
+```bash
+cat book.txt | llm complete --model tts-1 --format mp3 --voice coral
+```
+
+It will write the audio file in the current working directory. The following audio
+formats and voices are currently supported:
+
+* Formats: `--format mp3`, `--format opus`, `--format aac`, `--format flac`, `--format wav`, `--format pcm`
+* Voices: `--voice alloy`, `--voice ash`, `--voice coral`, `--voice echo`, `--voice fable`, `--voice onyx`, `--voice nova`, `--voice sage`, `--voice shimmer`
 
 ## Contributing & Distribution
 
diff --git a/attachment.go b/attachment.go
@@ -23,6 +23,7 @@ type AttachmentMeta struct {
 	ExpiresAt uint64 `json:"expires_at,omitempty"`
 	Caption   string `json:"transcript,omitempty"`
 	Data      []byte `json:"data"`
+	Type      string `json:"type"`
 }
 
 // OpenAI image metadata
@@ -57,19 +58,23 @@ func NewAttachmentWithImage(image *ImageMeta) *Attachment {
 
 // ReadAttachment returns an attachment from a reader object.
 // It is the responsibility of the caller to close the reader.
-func ReadAttachment(r io.Reader) (*Attachment, error) {
-	var filename string
+func ReadAttachment(r io.Reader, mimetype ...string) (*Attachment, error) {
+	var filename, typ string
 	data, err := io.ReadAll(r)
 	if err != nil {
 		return nil, err
 	}
 	if f, ok := r.(*os.File); ok {
 		filename = f.Name()
 	}
+	if len(mimetype) > 0 {
+		typ = mimetype[0]
+	}
 	return &Attachment{
 		meta: &AttachmentMeta{
 			Filename: filename,
 			Data:     data,
+			Type:     typ,
 		},
 	}, nil
 }
@@ -176,6 +181,11 @@ func (a *Attachment) Caption() string {
 // on the data and/or filename extension. Returns an empty string if
 // there is no data or filename
 func (a *Attachment) Type() string {
+	// If there's a mimetype set, use this
+	if a.meta != nil && a.meta.Type != "" {
+		return a.meta.Type
+	}
+
 	// If there's no data or filename, return empty
 	if len(a.Data()) == 0 && a.Filename() == "" {
 		return ""
@@ -191,9 +201,9 @@ func (a *Attachment) Type() string {
 	}
 
 	// Mimetype based on filename
-	if a.Filename() != "" {
+	if a.meta != nil && a.meta.Filename != "" {
 		// Detect mimetype from extension
-		mimetype = mime.TypeByExtension(filepath.Ext(a.Filename()))
+		mimetype = mime.TypeByExtension(filepath.Ext(a.meta.Filename))
 	}
 
 	// Return the default mimetype
diff --git a/cmd/llm/complete.go b/cmd/llm/complete.go
@@ -2,14 +2,15 @@ package main
 
 import (
 	"context"
+	"encoding/json"
 	"fmt"
 	"io"
 	"os"
 	"strings"
 
 	// Packages
 	llm "github.com/mutablelogic/go-llm"
-	"github.com/mutablelogic/go-llm/pkg/openai"
+	openai "github.com/mutablelogic/go-llm/pkg/openai"
 )
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -89,14 +90,23 @@ func (cmd *CompleteCmd) Run(globals *Globals) error {
 		completion, err := model.Completion(ctx, string(prompt), opts...)
 		if err != nil {
 			return err
+		} else if completion == nil {
+			return llm.ErrInternalServerError.Withf("No completion returned")
 		}
 
 		// Print the completion - text
 		if cmd.NoStream {
 			fmt.Println(completion.Text(0))
+		} else {
+			fmt.Println()
 		}
 
 		// Output completion attachments
+		type Result struct {
+			Filename string `json:"filename"`
+			Caption  string `json:"caption,omitempty"`
+		}
+		var out []Result
 		for i := 0; i < completion.Num(); i++ {
 			attachment := completion.Attachment(i)
 			if attachment == nil {
@@ -113,9 +123,21 @@ func (cmd *CompleteCmd) Run(globals *Globals) error {
 
 			if _, err := f.Write(attachment.Data()); err != nil {
 				return err
-			} else {
-				fmt.Printf("%q written to %s\n", attachment.Caption(), attachment.Filename())
 			}
+
+			out = append(out, Result{
+				Filename: attachment.Filename(),
+				Caption:  attachment.Caption(),
+			})
+		}
+
+		// Print the completion - attachments
+		if len(out) > 0 {
+			data, err := json.MarshalIndent(out, "", "  ")
+			if err != nil {
+				return err
+			}
+			fmt.Println(string(data))
 		}
 
 		// Return success
diff --git a/opt_format.go b/opt_format.go
@@ -8,7 +8,6 @@ import "strings"
 const (
 	mimeTypeText = "text/plain"
 	mimeTypeJSON = "application/json"
-	mimeTypeJpeg = "image/jpeg"
 	mimeTypeMP3  = "audio/mpeg"
 	mimeTypeOpus = "audio/opus"
 	mimeTypeAAC  = "audio/aac"
@@ -25,21 +24,20 @@ var (
 		mimeTypeJSON:  "json_object",
 		"json":        "json_object",
 		"json_object": "json_object",
-		mimeTypeJpeg:  "image",
-		"jpeg":        "image",
 		"image":       "image",
-		mimeTypeMP3:   "mp3",
-		mimeTypeOpus:  "opus",
-		mimeTypeAAC:   "aac",
-		mimeTypeFLAC:  "flac",
-		mimeTypeWAV:   "wav",
-		mimeTypePCM:   "pcm",
-		"mp3":         "mp3",
-		"opus":        "opus",
-		"aac":         "aac",
-		"flac":        "flac",
-		"wav":         "wav",
-		"pcm":         "pcm",
+		mimeTypeMP3:   "audio",
+		mimeTypeOpus:  "audio",
+		mimeTypeAAC:   "audio",
+		mimeTypeFLAC:  "audio",
+		mimeTypeWAV:   "audio",
+		mimeTypePCM:   "audio",
+		"audio":       "audio",
+		"mp3":         "audio",
+		"opus":        "audio",
+		"aac":         "audio",
+		"flac":        "audio",
+		"wav":         "audio",
+		"pcm":         "audio",
 	}
 	audioValues = []string{
 		"mp3", "opus", "aac", "flac", "wav", "pcm",
diff --git a/pkg/openai/audio.go b/pkg/openai/audio.go
@@ -1 +1,99 @@
 package openai
+
+import (
+	"context"
+	"io"
+
+	// Packages
+	client "github.com/mutablelogic/go-client"
+	llm "github.com/mutablelogic/go-llm"
+)
+
+///////////////////////////////////////////////////////////////////////////////
+// PRIVATE METHODS
+
+type reqAudioCompletion struct {
+	Model          string  `json:"model"`
+	Input          string  `json:"input"`
+	Voice          string  `json:"voice"`
+	Speed          float64 `json:"speed,omitempty"`
+	ResponseFormat string  `json:"response_format,omitempty"`
+}
+
+type responseAudio struct {
+	audio *llm.Attachment
+}
+
+// Send a completion request with text for text-to-speech
+func (model *model) audioCompletion(ctx context.Context, input string, opt *llm.Opts) (llm.Completion, error) {
+	// Request
+	req, err := client.NewJSONRequest(reqAudioCompletion{
+		Model:          model.Name(),
+		Input:          input,
+		Voice:          optVoice(opt),
+		Speed:          optSpeed(opt),
+		ResponseFormat: optAudioFormat(opt),
+	})
+	if err != nil {
+		return nil, err
+	}
+
+	// Response
+	var response responseAudio
+	if err := model.DoWithContext(ctx, req, &response, client.OptPath("audio", "speech")); err != nil {
+		return nil, err
+	}
+
+	return &response, nil
+}
+
+func (resp *responseAudio) Unmarshal(mimetype string, r io.Reader) error {
+	// Unmarshal the response
+	attachment, err := llm.ReadAttachment(r, mimetype)
+	if err != nil {
+		return err
+	} else {
+		resp.audio = attachment
+	}
+	return nil
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// COMPLETION
+
+// Return the number of completions
+func (r *responseAudio) Num() int {
+	return 1
+}
+
+// Return message for a specific completion
+func (r *responseAudio) Choice(index int) llm.Completion {
+	if index != 0 {
+		return nil
+	}
+	return r
+}
+
+// Return the role of the completion
+func (r *responseAudio) Role() string {
+	return "assistant"
+}
+
+// Text always returns an empty string: an audio completion holds no text
+func (r *responseAudio) Text(index int) string {
+	return ""
+}
+
+// Return media content for a specific completion
+func (r *responseAudio) Attachment(index int) *llm.Attachment {
+	if index != 0 {
+		return nil
+	} else {
+		return r.audio
+	}
+}
+
+// ToolCalls always returns nil: an audio completion holds no tool calls
+func (r *responseAudio) ToolCalls(index int) []llm.ToolCall {
+	return nil
+}
diff --git a/pkg/openai/image.go b/pkg/openai/image.go
@@ -66,12 +66,6 @@ func (model *model) imageCompletion(ctx context.Context, prompt string, opt *llm
 	return &response, nil
 }
 
-// Send a completion request with text for a text-to-speech completion
-// TODO
-func (model *model) audioCompletion(ctx context.Context, prompt string, opt *llm.Opts) (llm.Completion, error) {
-	return nil, llm.ErrNotImplemented
-}
-
 ///////////////////////////////////////////////////////////////////////////////
 // PRIVATE METHODS
 
diff --git a/pkg/openai/opt.go b/pkg/openai/opt.go
@@ -133,6 +133,17 @@ func WithAudio(voice, format string) llm.Opt {
 	}
 }
 
+// WithAudioSpeed sets the speed of the generated speech; v must be between 0.25 and 4.0
+func WithAudioSpeed(v float64) llm.Opt {
+	return func(o *llm.Opts) error {
+		if v < 0.25 || v > 4.0 {
+			return llm.ErrBadParameter.With("speed")
+		}
+		o.Set("speed", v)
+		return nil
+	}
+}
+
 // Parameters for image output
 func WithSize(v string) llm.Opt {
 	return func(o *llm.Opts) error {
@@ -264,9 +275,14 @@ func optPrediction(opts *llm.Opts) *Content {
 }
 
 func optAudio(opts *llm.Opts) *Audio {
-	if v, ok := opts.Get("audio").(*Audio); ok {
+	v, ok := opts.Get("audio").(*Audio)
+	if ok {
 		return v
 	}
+	if v == nil {
+		opts.Set("audio", NewAudio("ash", "mp3"))
+		return optAudio(opts)
+	}
 	return nil
 }
 
diff --git a/pkg/openai/opt_audio.go b/pkg/openai/opt_audio.go
@@ -1,6 +1,10 @@
 package openai
 
-import "strings"
+import (
+	"strings"
+
+	"github.com/mutablelogic/go-llm"
+)
 
 ///////////////////////////////////////////////////////////////////////////////
 // TYPES
@@ -11,6 +15,9 @@ type Audio struct {
 
 	// Supported formats: wav, mp3, flac, opus, or pcm16
 	Format string `json:"format"`
+
+	// Speed of the generated speech (0.25 to 4.0); omitted from the JSON when zero
+	Speed float64 `json:"speed,omitempty"`
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -24,3 +31,30 @@ func NewAudio(voice, format string) *Audio {
 	}
 	return &Audio{Voice: voice, Format: format}
 }
+
+///////////////////////////////////////////////////////////////////////////////
+// PRIVATE METHODS
+
+func optVoice(opts *llm.Opts) string {
+	if audio := optAudio(opts); audio != nil {
+		return audio.Voice
+	} else {
+		return ""
+	}
+}
+
+// optSpeed returns the speed set by WithAudioSpeed, or 1.0 when it is unset
+func optSpeed(opts *llm.Opts) float64 {
+	if v, ok := opts.Get("speed").(float64); ok {
+		return v // stored under the "speed" key, not on the Audio struct
+	}
+	return 1.0
+}
+
+func optAudioFormat(opts *llm.Opts) string {
+	if audio := optAudio(opts); audio != nil {
+		return audio.Format
+	} else {
+		return "mp3"
+	}
+}