diff --git a/.github/workflows/cicd.yml b/.github/workflows/cicd.yml index fcf0922ce..18b4e8510 100644 --- a/.github/workflows/cicd.yml +++ b/.github/workflows/cicd.yml @@ -35,16 +35,20 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - ruby-version: ['3.1', '3.2', '3.3', '3.4', 'jruby-10.0.1.0'] - rails-version: ['rails-7.1', 'rails-7.2', 'rails-8.0'] + ruby-version: ['3.1', '3.2', '3.3', '3.4', 'jruby-10.0.2.0'] + rails-version: ['rails-7.1', 'rails-7.2', 'rails-8.0', 'rails-8.1'] exclude: # Rails 8 requires Ruby 3.2+ - ruby-version: '3.1' rails-version: 'rails-8.0' + - ruby-version: '3.1' + rails-version: 'rails-8.1' # JRuby only supports up to 7.1 right now - - ruby-version: 'jruby-10.0.1.0' + - ruby-version: 'jruby-10.0.2.0' + rails-version: 'rails-8.1' + - ruby-version: 'jruby-10.0.2.0' rails-version: 'rails-8.0' - - ruby-version: 'jruby-10.0.1.0' + - ruby-version: 'jruby-10.0.2.0' rails-version: 'rails-7.2' steps: @@ -200,4 +204,4 @@ jobs: fi } env: - GEM_HOST_API_KEY: "${{secrets.RUBYGEMS_AUTH_TOKEN}}" \ No newline at end of file + GEM_HOST_API_KEY: "${{secrets.RUBYGEMS_AUTH_TOKEN}}" diff --git a/.gitignore b/.gitignore index 37c1a60b7..2e00ef551 100644 --- a/.gitignore +++ b/.gitignore @@ -48,6 +48,7 @@ build-iPhoneSimulator/ # for a library or gem, you might want to ignore these files since the code is # intended to run in multiple environments; otherwise, check them in: Gemfile.lock +gemfiles/*.lock # .ruby-version # .ruby-gemset diff --git a/Appraisals b/Appraisals index e555c8e71..f953e00be 100644 --- a/Appraisals +++ b/Appraisals @@ -17,3 +17,9 @@ appraise 'rails-8.0' do gem 'rails', '~> 8.0.0' end end + +appraise 'rails-8.1' do + group :development do + gem 'rails', '~> 8.1.0' + end +end diff --git a/README.md b/README.md index c0b8fcd50..825367c74 100644 --- a/README.md +++ b/README.md @@ -74,6 +74,11 @@ RubyLLM.embed "Ruby is elegant and expressive" RubyLLM.transcribe "meeting.wav" ``` +```ruby +# Text to speech +RubyLLM.tts "Hello, welcome to 
RubyLLM!" +``` + ```ruby # Moderate content for safety RubyLLM.moderate "Check if this text is safe" diff --git a/docs/_core_features/text-to-speech.md b/docs/_core_features/text-to-speech.md new file mode 100644 index 000000000..11bc48315 --- /dev/null +++ b/docs/_core_features/text-to-speech.md @@ -0,0 +1,93 @@ +--- +layout: default +title: Text to Speech +nav_order: 7 +description: Convert text to speech +--- + +# {{ page.title }} +{: .d-inline-block .no_toc } + +v1.9.0+ +{: .label .label-green } + +{{ page.description }} +{: .fs-6 .fw-300 } + +## Table of contents +{: .no_toc .text-delta } + +1. TOC +{:toc} + +--- + +After reading this guide, you will know: + +* How to generate speech from text. +* How to save audio files. +* How to select different voices. +* How to access raw audio data. +* Specifics of language support. + +## Basic Text to Speech + +Generate audio with the global `RubyLLM.tts` method: + +```ruby +audio = RubyLLM.tts("Hello, welcome to RubyLLM!") + +``` + +## Save Audio File +You can save the generated audio to a file. +If you are using OpenAI, the audio will be saved as an MP3 file. + +```ruby +audio = RubyLLM.tts("This is a text to speech example.", provider: :openai, model: "gpt-4o-mini-tts") +audio.save("example.mp3") +``` + +If you are using Gemini, the audio will be saved as a raw PCM file. + +```ruby +audio = RubyLLM.tts("This is a text to speech example.", provider: :gemini, model: "gemini-2.5-flash-preview-tts") +audio.save("example.pcm") +``` + +You can convert it to MP3 using ffmpeg: + +```bash +ffmpeg -f s16le -ar 24000 -ac 1 -i example.pcm example.mp3 +``` + +### Select Voice +You can specify different voices. Supported voices for OpenAI +are alloy, ash, ballad, coral, echo, fable, onyx, nova, sage, shimmer, and verse. + +For Gemini, see the [Gemini voices](https://ai.google.dev/gemini-api/docs/speech-generation#voices). 
+ +```ruby +# Using a specific voice +voice = "ash" +audio = RubyLLM.tts("Hello, this is a #{voice}'s voice.", voice: voice) +``` + +### Access Audio Data +You can access the raw audio data: + +```ruby +audio = RubyLLM.tts("Accessing raw audio data.") +audio.data # => binary audio data (MP3 for OpenAI, PCM for Gemini) +``` + +### Language Support +OpenAI and Gemini detect the language automatically from the text provided. +Previously, you could specify the language manually in Gemini. + +## Next Steps + +* [Chatting with AI Models]({% link _core_features/chat.md %}): Learn about conversational AI. +* [Image Generation]({% link _core_features/image-generation.md %}): Generate images from text. +* [Error Handling]({% link _advanced/error-handling.md %}): Master handling API errors. + diff --git a/docs/index.md b/docs/index.md index 3006598e1..8ef9d6032 100644 --- a/docs/index.md +++ b/docs/index.md @@ -138,6 +138,11 @@ RubyLLM.embed "Ruby is elegant and expressive" RubyLLM.transcribe "meeting.wav" ``` +```ruby +# Text to speech +RubyLLM.tts "Hello, welcome to RubyLLM!" +``` + ```ruby # Moderate content for safety RubyLLM.moderate "Check if this text is safe" diff --git a/gemfiles/rails_7.1.gemfile.lock b/gemfiles/rails_7.1.gemfile.lock deleted file mode 100644 index 5095b296c..000000000 --- a/gemfiles/rails_7.1.gemfile.lock +++ /dev/null @@ -1,421 +0,0 @@ -PATH - remote: .. 
- specs: - ruby_llm (1.8.2) - base64 - event_stream_parser (~> 1) - faraday (>= 1.10.0) - faraday-multipart (>= 1) - faraday-net_http (>= 1) - faraday-retry (>= 1) - marcel (~> 1.0) - zeitwerk (~> 2) - -GEM - remote: https://rubygems.org/ - specs: - actioncable (7.1.5.2) - actionpack (= 7.1.5.2) - activesupport (= 7.1.5.2) - nio4r (~> 2.0) - websocket-driver (>= 0.6.1) - zeitwerk (~> 2.6) - actionmailbox (7.1.5.2) - actionpack (= 7.1.5.2) - activejob (= 7.1.5.2) - activerecord (= 7.1.5.2) - activestorage (= 7.1.5.2) - activesupport (= 7.1.5.2) - mail (>= 2.7.1) - net-imap - net-pop - net-smtp - actionmailer (7.1.5.2) - actionpack (= 7.1.5.2) - actionview (= 7.1.5.2) - activejob (= 7.1.5.2) - activesupport (= 7.1.5.2) - mail (~> 2.5, >= 2.5.4) - net-imap - net-pop - net-smtp - rails-dom-testing (~> 2.2) - actionpack (7.1.5.2) - actionview (= 7.1.5.2) - activesupport (= 7.1.5.2) - nokogiri (>= 1.8.5) - racc - rack (>= 2.2.4) - rack-session (>= 1.0.1) - rack-test (>= 0.6.3) - rails-dom-testing (~> 2.2) - rails-html-sanitizer (~> 1.6) - actiontext (7.1.5.2) - actionpack (= 7.1.5.2) - activerecord (= 7.1.5.2) - activestorage (= 7.1.5.2) - activesupport (= 7.1.5.2) - globalid (>= 0.6.0) - nokogiri (>= 1.8.5) - actionview (7.1.5.2) - activesupport (= 7.1.5.2) - builder (~> 3.1) - erubi (~> 1.11) - rails-dom-testing (~> 2.2) - rails-html-sanitizer (~> 1.6) - activejob (7.1.5.2) - activesupport (= 7.1.5.2) - globalid (>= 0.3.6) - activemodel (7.1.5.2) - activesupport (= 7.1.5.2) - activerecord (7.1.5.2) - activemodel (= 7.1.5.2) - activesupport (= 7.1.5.2) - timeout (>= 0.4.0) - activestorage (7.1.5.2) - actionpack (= 7.1.5.2) - activejob (= 7.1.5.2) - activerecord (= 7.1.5.2) - activesupport (= 7.1.5.2) - marcel (~> 1.0) - activesupport (7.1.5.2) - base64 - benchmark (>= 0.3) - bigdecimal - concurrent-ruby (~> 1.0, >= 1.0.2) - connection_pool (>= 2.2.5) - drb - i18n (>= 1.6, < 2) - logger (>= 1.4.2) - minitest (>= 5.1) - mutex_m - securerandom (>= 0.3) - tzinfo (~> 2.0) - 
addressable (2.8.7) - public_suffix (>= 2.0.2, < 7.0) - appraisal (2.5.0) - bundler - rake - thor (>= 0.14.0) - ast (2.4.3) - async (2.34.0) - console (~> 1.29) - fiber-annotation - io-event (~> 1.11) - metrics (~> 0.12) - traces (~> 0.18) - base64 (0.3.0) - benchmark (0.5.0) - bigdecimal (3.3.1) - builder (3.3.0) - childprocess (5.1.0) - logger (~> 1.5) - codecov (0.2.12) - json - simplecov - coderay (1.1.3) - concurrent-ruby (1.3.5) - connection_pool (2.5.4) - console (1.34.2) - fiber-annotation - fiber-local (~> 1.1) - json - crack (1.0.0) - bigdecimal - rexml - crass (1.0.6) - date (3.4.1) - diff-lcs (1.6.2) - docile (1.4.1) - dotenv (3.1.8) - drb (2.2.3) - erb (5.1.1) - erubi (1.13.1) - event_stream_parser (1.0.0) - faraday (2.14.0) - faraday-net_http (>= 2.0, < 3.5) - json - logger - faraday-multipart (1.1.1) - multipart-post (~> 2.0) - faraday-net_http (3.4.1) - net-http (>= 0.5.0) - faraday-retry (2.3.2) - faraday (~> 2.0) - ferrum (0.17.1) - addressable (~> 2.5) - base64 (~> 0.2) - concurrent-ruby (~> 1.1) - webrick (~> 1.7) - websocket-driver (~> 0.7) - ffi (1.17.2-x86_64-linux-gnu) - fiber-annotation (0.2.0) - fiber-local (1.1.0) - fiber-storage - fiber-storage (1.0.1) - flay (2.13.3) - erubi (~> 1.10) - path_expander (~> 1.0) - ruby_parser (~> 3.0) - sexp_processor (~> 4.0) - globalid (1.3.0) - activesupport (>= 6.1) - google-cloud-env (2.3.1) - base64 (~> 0.2) - faraday (>= 1.0, < 3.a) - google-logging-utils (0.2.0) - googleauth (1.15.1) - faraday (>= 1.0, < 3.a) - google-cloud-env (~> 2.2) - google-logging-utils (~> 0.1) - jwt (>= 1.4, < 4.0) - multi_json (~> 1.11) - os (>= 0.9, < 2.0) - signet (>= 0.16, < 2.a) - hashdiff (1.2.1) - i18n (1.14.7) - concurrent-ruby (~> 1.0) - image_processing (1.14.0) - mini_magick (>= 4.9.5, < 6) - ruby-vips (>= 2.0.17, < 3) - iniparse (1.5.0) - io-console (0.8.1) - io-event (1.14.0) - irb (1.15.2) - pp (>= 0.6.0) - rdoc (>= 4.0.0) - reline (>= 0.4.2) - json (2.15.1) - json-schema (6.0.0) - addressable (~> 2.8) - 
bigdecimal (~> 3.1) - jwt (3.1.2) - base64 - language_server-protocol (3.17.0.5) - lint_roller (1.1.0) - logger (1.7.0) - loofah (2.24.1) - crass (~> 1.0.2) - nokogiri (>= 1.12.0) - mail (2.8.1) - mini_mime (>= 0.1.1) - net-imap - net-pop - net-smtp - marcel (1.1.0) - method_source (1.1.0) - metrics (0.15.0) - mini_magick (5.3.1) - logger - mini_mime (1.1.5) - minitest (5.26.0) - multi_json (1.17.0) - multipart-post (2.4.1) - mutex_m (0.3.0) - net-http (0.6.0) - uri - net-imap (0.5.12) - date - net-protocol - net-pop (0.1.2) - net-protocol - net-protocol (0.2.2) - timeout - net-smtp (0.5.1) - net-protocol - nio4r (2.7.4) - nokogiri (1.18.10-x86_64-linux-gnu) - racc (~> 1.4) - os (1.1.4) - overcommit (0.68.0) - childprocess (>= 0.6.3, < 6) - iniparse (~> 1.4) - rexml (>= 3.3.9) - parallel (1.27.0) - parser (3.3.9.0) - ast (~> 2.4.1) - racc - path_expander (1.1.3) - pp (0.6.3) - prettyprint - prettyprint (0.2.0) - prism (1.6.0) - pry (0.15.2) - coderay (~> 1.1) - method_source (~> 1.0) - psych (5.2.6) - date - stringio - public_suffix (6.0.2) - racc (1.8.1) - rack (3.2.3) - rack-session (2.1.1) - base64 (>= 0.1.0) - rack (>= 3.0.0) - rack-test (2.2.0) - rack (>= 1.3) - rackup (2.2.1) - rack (>= 3) - rails (7.1.5.2) - actioncable (= 7.1.5.2) - actionmailbox (= 7.1.5.2) - actionmailer (= 7.1.5.2) - actionpack (= 7.1.5.2) - actiontext (= 7.1.5.2) - actionview (= 7.1.5.2) - activejob (= 7.1.5.2) - activemodel (= 7.1.5.2) - activerecord (= 7.1.5.2) - activestorage (= 7.1.5.2) - activesupport (= 7.1.5.2) - bundler (>= 1.15.0) - railties (= 7.1.5.2) - rails-dom-testing (2.3.0) - activesupport (>= 5.0.0) - minitest - nokogiri (>= 1.6) - rails-html-sanitizer (1.6.2) - loofah (~> 2.21) - nokogiri (>= 1.15.7, != 1.16.7, != 1.16.6, != 1.16.5, != 1.16.4, != 1.16.3, != 1.16.2, != 1.16.1, != 1.16.0.rc1, != 1.16.0) - railties (7.1.5.2) - actionpack (= 7.1.5.2) - activesupport (= 7.1.5.2) - irb - rackup (>= 1.0.0) - rake (>= 12.2) - thor (~> 1.0, >= 1.2.2) - zeitwerk (~> 2.6) - 
rainbow (3.1.1) - rake (13.3.0) - rdoc (6.15.0) - erb - psych (>= 4.0.0) - tsort - regexp_parser (2.11.3) - reline (0.6.2) - io-console (~> 0.5) - rexml (3.4.4) - rspec (3.13.2) - rspec-core (~> 3.13.0) - rspec-expectations (~> 3.13.0) - rspec-mocks (~> 3.13.0) - rspec-core (3.13.6) - rspec-support (~> 3.13.0) - rspec-expectations (3.13.5) - diff-lcs (>= 1.2.0, < 2.0) - rspec-support (~> 3.13.0) - rspec-mocks (3.13.6) - diff-lcs (>= 1.2.0, < 2.0) - rspec-support (~> 3.13.0) - rspec-support (3.13.6) - rubocop (1.81.6) - json (~> 2.3) - language_server-protocol (~> 3.17.0.2) - lint_roller (~> 1.1.0) - parallel (~> 1.10) - parser (>= 3.3.0.2) - rainbow (>= 2.2.2, < 4.0) - regexp_parser (>= 2.9.3, < 3.0) - rubocop-ast (>= 1.47.1, < 2.0) - ruby-progressbar (~> 1.7) - unicode-display_width (>= 2.4.0, < 4.0) - rubocop-ast (1.47.1) - parser (>= 3.3.7.2) - prism (~> 1.4) - rubocop-performance (1.26.1) - lint_roller (~> 1.1) - rubocop (>= 1.75.0, < 2.0) - rubocop-ast (>= 1.47.1, < 2.0) - rubocop-rake (0.7.1) - lint_roller (~> 1.1) - rubocop (>= 1.72.1) - rubocop-rspec (3.7.0) - lint_roller (~> 1.1) - rubocop (~> 1.72, >= 1.72.1) - ruby-progressbar (1.13.0) - ruby-vips (2.2.5) - ffi (~> 1.12) - logger - ruby_llm-schema (0.1.9) - ruby_parser (3.21.1) - racc (~> 1.5) - sexp_processor (~> 4.16) - securerandom (0.4.1) - sexp_processor (4.17.4) - signet (0.21.0) - addressable (~> 2.8) - faraday (>= 0.17.5, < 3.a) - jwt (>= 1.5, < 4.0) - multi_json (~> 1.10) - simplecov (0.22.0) - docile (~> 1.1) - simplecov-html (~> 0.11) - simplecov_json_formatter (~> 0.1) - simplecov-cobertura (3.1.0) - rexml - simplecov (~> 0.19) - simplecov-html (0.13.2) - simplecov_json_formatter (0.1.4) - sqlite3 (2.7.4-x86_64-linux-gnu) - stringio (3.1.7) - thor (1.4.0) - timeout (0.4.3) - traces (0.18.2) - tsort (0.2.0) - tzinfo (2.0.6) - concurrent-ruby (~> 1.0) - unicode-display_width (3.2.0) - unicode-emoji (~> 4.1) - unicode-emoji (4.1.0) - uri (1.0.4) - vcr (6.3.1) - base64 - webmock (3.25.1) - 
addressable (>= 2.8.0) - crack (>= 0.3.2) - hashdiff (>= 0.4.0, < 2.0.0) - webrick (1.9.1) - websocket-driver (0.8.0) - base64 - websocket-extensions (>= 0.1.0) - websocket-extensions (0.1.5) - zeitwerk (2.7.3) - -PLATFORMS - x86_64-linux - -DEPENDENCIES - activerecord-jdbcsqlite3-adapter - appraisal - async - bundler (>= 2.0) - codecov - dotenv - ferrum - flay - googleauth - image_processing (~> 1.2) - irb - jdbc-sqlite3 - json-schema - nokogiri - overcommit (>= 0.66) - pry (>= 0.14) - rails (~> 7.1.0) - rake (>= 13.0) - reline - rspec (~> 3.12) - rubocop (>= 1.0) - rubocop-performance - rubocop-rake (>= 0.6) - rubocop-rspec - ruby_llm! - ruby_llm-schema (~> 0.1.0) - simplecov (>= 0.21) - simplecov-cobertura - sqlite3 - vcr - webmock (~> 3.18) - -BUNDLED WITH - 2.6.9 diff --git a/gemfiles/rails_7.2.gemfile.lock b/gemfiles/rails_7.2.gemfile.lock deleted file mode 100644 index 7f1f581a1..000000000 --- a/gemfiles/rails_7.2.gemfile.lock +++ /dev/null @@ -1,415 +0,0 @@ -PATH - remote: .. - specs: - ruby_llm (1.8.2) - base64 - event_stream_parser (~> 1) - faraday (>= 1.10.0) - faraday-multipart (>= 1) - faraday-net_http (>= 1) - faraday-retry (>= 1) - marcel (~> 1.0) - zeitwerk (~> 2) - -GEM - remote: https://rubygems.org/ - specs: - actioncable (7.2.2.2) - actionpack (= 7.2.2.2) - activesupport (= 7.2.2.2) - nio4r (~> 2.0) - websocket-driver (>= 0.6.1) - zeitwerk (~> 2.6) - actionmailbox (7.2.2.2) - actionpack (= 7.2.2.2) - activejob (= 7.2.2.2) - activerecord (= 7.2.2.2) - activestorage (= 7.2.2.2) - activesupport (= 7.2.2.2) - mail (>= 2.8.0) - actionmailer (7.2.2.2) - actionpack (= 7.2.2.2) - actionview (= 7.2.2.2) - activejob (= 7.2.2.2) - activesupport (= 7.2.2.2) - mail (>= 2.8.0) - rails-dom-testing (~> 2.2) - actionpack (7.2.2.2) - actionview (= 7.2.2.2) - activesupport (= 7.2.2.2) - nokogiri (>= 1.8.5) - racc - rack (>= 2.2.4, < 3.2) - rack-session (>= 1.0.1) - rack-test (>= 0.6.3) - rails-dom-testing (~> 2.2) - rails-html-sanitizer (~> 1.6) - useragent (~> 
0.16) - actiontext (7.2.2.2) - actionpack (= 7.2.2.2) - activerecord (= 7.2.2.2) - activestorage (= 7.2.2.2) - activesupport (= 7.2.2.2) - globalid (>= 0.6.0) - nokogiri (>= 1.8.5) - actionview (7.2.2.2) - activesupport (= 7.2.2.2) - builder (~> 3.1) - erubi (~> 1.11) - rails-dom-testing (~> 2.2) - rails-html-sanitizer (~> 1.6) - activejob (7.2.2.2) - activesupport (= 7.2.2.2) - globalid (>= 0.3.6) - activemodel (7.2.2.2) - activesupport (= 7.2.2.2) - activerecord (7.2.2.2) - activemodel (= 7.2.2.2) - activesupport (= 7.2.2.2) - timeout (>= 0.4.0) - activestorage (7.2.2.2) - actionpack (= 7.2.2.2) - activejob (= 7.2.2.2) - activerecord (= 7.2.2.2) - activesupport (= 7.2.2.2) - marcel (~> 1.0) - activesupport (7.2.2.2) - base64 - benchmark (>= 0.3) - bigdecimal - concurrent-ruby (~> 1.0, >= 1.3.1) - connection_pool (>= 2.2.5) - drb - i18n (>= 1.6, < 2) - logger (>= 1.4.2) - minitest (>= 5.1) - securerandom (>= 0.3) - tzinfo (~> 2.0, >= 2.0.5) - addressable (2.8.7) - public_suffix (>= 2.0.2, < 7.0) - appraisal (2.5.0) - bundler - rake - thor (>= 0.14.0) - ast (2.4.3) - async (2.34.0) - console (~> 1.29) - fiber-annotation - io-event (~> 1.11) - metrics (~> 0.12) - traces (~> 0.18) - base64 (0.3.0) - benchmark (0.5.0) - bigdecimal (3.3.1) - builder (3.3.0) - childprocess (5.1.0) - logger (~> 1.5) - codecov (0.2.12) - json - simplecov - coderay (1.1.3) - concurrent-ruby (1.3.5) - connection_pool (2.5.4) - console (1.34.2) - fiber-annotation - fiber-local (~> 1.1) - json - crack (1.0.0) - bigdecimal - rexml - crass (1.0.6) - date (3.4.1) - diff-lcs (1.6.2) - docile (1.4.1) - dotenv (3.1.8) - drb (2.2.3) - erb (5.1.1) - erubi (1.13.1) - event_stream_parser (1.0.0) - faraday (2.14.0) - faraday-net_http (>= 2.0, < 3.5) - json - logger - faraday-multipart (1.1.1) - multipart-post (~> 2.0) - faraday-net_http (3.4.1) - net-http (>= 0.5.0) - faraday-retry (2.3.2) - faraday (~> 2.0) - ferrum (0.17.1) - addressable (~> 2.5) - base64 (~> 0.2) - concurrent-ruby (~> 1.1) - webrick 
(~> 1.7) - websocket-driver (~> 0.7) - ffi (1.17.2-x86_64-linux-gnu) - fiber-annotation (0.2.0) - fiber-local (1.1.0) - fiber-storage - fiber-storage (1.0.1) - flay (2.13.3) - erubi (~> 1.10) - path_expander (~> 1.0) - ruby_parser (~> 3.0) - sexp_processor (~> 4.0) - globalid (1.3.0) - activesupport (>= 6.1) - google-cloud-env (2.3.1) - base64 (~> 0.2) - faraday (>= 1.0, < 3.a) - google-logging-utils (0.2.0) - googleauth (1.15.1) - faraday (>= 1.0, < 3.a) - google-cloud-env (~> 2.2) - google-logging-utils (~> 0.1) - jwt (>= 1.4, < 4.0) - multi_json (~> 1.11) - os (>= 0.9, < 2.0) - signet (>= 0.16, < 2.a) - hashdiff (1.2.1) - i18n (1.14.7) - concurrent-ruby (~> 1.0) - image_processing (1.14.0) - mini_magick (>= 4.9.5, < 6) - ruby-vips (>= 2.0.17, < 3) - iniparse (1.5.0) - io-console (0.8.1) - io-event (1.14.0) - irb (1.15.2) - pp (>= 0.6.0) - rdoc (>= 4.0.0) - reline (>= 0.4.2) - json (2.15.1) - json-schema (6.0.0) - addressable (~> 2.8) - bigdecimal (~> 3.1) - jwt (3.1.2) - base64 - language_server-protocol (3.17.0.5) - lint_roller (1.1.0) - logger (1.7.0) - loofah (2.24.1) - crass (~> 1.0.2) - nokogiri (>= 1.12.0) - mail (2.8.1) - mini_mime (>= 0.1.1) - net-imap - net-pop - net-smtp - marcel (1.1.0) - method_source (1.1.0) - metrics (0.15.0) - mini_magick (5.3.1) - logger - mini_mime (1.1.5) - minitest (5.26.0) - multi_json (1.17.0) - multipart-post (2.4.1) - net-http (0.6.0) - uri - net-imap (0.5.12) - date - net-protocol - net-pop (0.1.2) - net-protocol - net-protocol (0.2.2) - timeout - net-smtp (0.5.1) - net-protocol - nio4r (2.7.4) - nokogiri (1.18.10-x86_64-linux-gnu) - racc (~> 1.4) - os (1.1.4) - overcommit (0.68.0) - childprocess (>= 0.6.3, < 6) - iniparse (~> 1.4) - rexml (>= 3.3.9) - parallel (1.27.0) - parser (3.3.9.0) - ast (~> 2.4.1) - racc - path_expander (1.1.3) - pp (0.6.3) - prettyprint - prettyprint (0.2.0) - prism (1.6.0) - pry (0.15.2) - coderay (~> 1.1) - method_source (~> 1.0) - psych (5.2.6) - date - stringio - public_suffix (6.0.2) - racc 
(1.8.1) - rack (3.1.18) - rack-session (2.1.1) - base64 (>= 0.1.0) - rack (>= 3.0.0) - rack-test (2.2.0) - rack (>= 1.3) - rackup (2.2.1) - rack (>= 3) - rails (7.2.2.2) - actioncable (= 7.2.2.2) - actionmailbox (= 7.2.2.2) - actionmailer (= 7.2.2.2) - actionpack (= 7.2.2.2) - actiontext (= 7.2.2.2) - actionview (= 7.2.2.2) - activejob (= 7.2.2.2) - activemodel (= 7.2.2.2) - activerecord (= 7.2.2.2) - activestorage (= 7.2.2.2) - activesupport (= 7.2.2.2) - bundler (>= 1.15.0) - railties (= 7.2.2.2) - rails-dom-testing (2.3.0) - activesupport (>= 5.0.0) - minitest - nokogiri (>= 1.6) - rails-html-sanitizer (1.6.2) - loofah (~> 2.21) - nokogiri (>= 1.15.7, != 1.16.7, != 1.16.6, != 1.16.5, != 1.16.4, != 1.16.3, != 1.16.2, != 1.16.1, != 1.16.0.rc1, != 1.16.0) - railties (7.2.2.2) - actionpack (= 7.2.2.2) - activesupport (= 7.2.2.2) - irb (~> 1.13) - rackup (>= 1.0.0) - rake (>= 12.2) - thor (~> 1.0, >= 1.2.2) - zeitwerk (~> 2.6) - rainbow (3.1.1) - rake (13.3.0) - rdoc (6.15.0) - erb - psych (>= 4.0.0) - tsort - regexp_parser (2.11.3) - reline (0.6.2) - io-console (~> 0.5) - rexml (3.4.4) - rspec (3.13.2) - rspec-core (~> 3.13.0) - rspec-expectations (~> 3.13.0) - rspec-mocks (~> 3.13.0) - rspec-core (3.13.6) - rspec-support (~> 3.13.0) - rspec-expectations (3.13.5) - diff-lcs (>= 1.2.0, < 2.0) - rspec-support (~> 3.13.0) - rspec-mocks (3.13.6) - diff-lcs (>= 1.2.0, < 2.0) - rspec-support (~> 3.13.0) - rspec-support (3.13.6) - rubocop (1.81.6) - json (~> 2.3) - language_server-protocol (~> 3.17.0.2) - lint_roller (~> 1.1.0) - parallel (~> 1.10) - parser (>= 3.3.0.2) - rainbow (>= 2.2.2, < 4.0) - regexp_parser (>= 2.9.3, < 3.0) - rubocop-ast (>= 1.47.1, < 2.0) - ruby-progressbar (~> 1.7) - unicode-display_width (>= 2.4.0, < 4.0) - rubocop-ast (1.47.1) - parser (>= 3.3.7.2) - prism (~> 1.4) - rubocop-performance (1.26.1) - lint_roller (~> 1.1) - rubocop (>= 1.75.0, < 2.0) - rubocop-ast (>= 1.47.1, < 2.0) - rubocop-rake (0.7.1) - lint_roller (~> 1.1) - rubocop (>= 1.72.1) 
- rubocop-rspec (3.7.0) - lint_roller (~> 1.1) - rubocop (~> 1.72, >= 1.72.1) - ruby-progressbar (1.13.0) - ruby-vips (2.2.5) - ffi (~> 1.12) - logger - ruby_llm-schema (0.1.9) - ruby_parser (3.21.1) - racc (~> 1.5) - sexp_processor (~> 4.16) - securerandom (0.4.1) - sexp_processor (4.17.4) - signet (0.21.0) - addressable (~> 2.8) - faraday (>= 0.17.5, < 3.a) - jwt (>= 1.5, < 4.0) - multi_json (~> 1.10) - simplecov (0.22.0) - docile (~> 1.1) - simplecov-html (~> 0.11) - simplecov_json_formatter (~> 0.1) - simplecov-cobertura (3.1.0) - rexml - simplecov (~> 0.19) - simplecov-html (0.13.2) - simplecov_json_formatter (0.1.4) - sqlite3 (2.7.4-x86_64-linux-gnu) - stringio (3.1.7) - thor (1.4.0) - timeout (0.4.3) - traces (0.18.2) - tsort (0.2.0) - tzinfo (2.0.6) - concurrent-ruby (~> 1.0) - unicode-display_width (3.2.0) - unicode-emoji (~> 4.1) - unicode-emoji (4.1.0) - uri (1.0.4) - useragent (0.16.11) - vcr (6.3.1) - base64 - webmock (3.25.1) - addressable (>= 2.8.0) - crack (>= 0.3.2) - hashdiff (>= 0.4.0, < 2.0.0) - webrick (1.9.1) - websocket-driver (0.8.0) - base64 - websocket-extensions (>= 0.1.0) - websocket-extensions (0.1.5) - zeitwerk (2.7.3) - -PLATFORMS - x86_64-linux - -DEPENDENCIES - activerecord-jdbcsqlite3-adapter - appraisal - async - bundler (>= 2.0) - codecov - dotenv - ferrum - flay - googleauth - image_processing (~> 1.2) - irb - jdbc-sqlite3 - json-schema - nokogiri - overcommit (>= 0.66) - pry (>= 0.14) - rails (~> 7.2.0) - rake (>= 13.0) - reline - rspec (~> 3.12) - rubocop (>= 1.0) - rubocop-performance - rubocop-rake (>= 0.6) - rubocop-rspec - ruby_llm! - ruby_llm-schema (~> 0.1.0) - simplecov (>= 0.21) - simplecov-cobertura - sqlite3 - vcr - webmock (~> 3.18) - -BUNDLED WITH - 2.6.9 diff --git a/gemfiles/rails_8.0.gemfile.lock b/gemfiles/rails_8.0.gemfile.lock deleted file mode 100644 index 40a66c3ee..000000000 --- a/gemfiles/rails_8.0.gemfile.lock +++ /dev/null @@ -1,416 +0,0 @@ -PATH - remote: .. 
- specs: - ruby_llm (1.8.2) - base64 - event_stream_parser (~> 1) - faraday (>= 1.10.0) - faraday-multipart (>= 1) - faraday-net_http (>= 1) - faraday-retry (>= 1) - marcel (~> 1.0) - zeitwerk (~> 2) - -GEM - remote: https://rubygems.org/ - specs: - actioncable (8.0.3) - actionpack (= 8.0.3) - activesupport (= 8.0.3) - nio4r (~> 2.0) - websocket-driver (>= 0.6.1) - zeitwerk (~> 2.6) - actionmailbox (8.0.3) - actionpack (= 8.0.3) - activejob (= 8.0.3) - activerecord (= 8.0.3) - activestorage (= 8.0.3) - activesupport (= 8.0.3) - mail (>= 2.8.0) - actionmailer (8.0.3) - actionpack (= 8.0.3) - actionview (= 8.0.3) - activejob (= 8.0.3) - activesupport (= 8.0.3) - mail (>= 2.8.0) - rails-dom-testing (~> 2.2) - actionpack (8.0.3) - actionview (= 8.0.3) - activesupport (= 8.0.3) - nokogiri (>= 1.8.5) - rack (>= 2.2.4) - rack-session (>= 1.0.1) - rack-test (>= 0.6.3) - rails-dom-testing (~> 2.2) - rails-html-sanitizer (~> 1.6) - useragent (~> 0.16) - actiontext (8.0.3) - actionpack (= 8.0.3) - activerecord (= 8.0.3) - activestorage (= 8.0.3) - activesupport (= 8.0.3) - globalid (>= 0.6.0) - nokogiri (>= 1.8.5) - actionview (8.0.3) - activesupport (= 8.0.3) - builder (~> 3.1) - erubi (~> 1.11) - rails-dom-testing (~> 2.2) - rails-html-sanitizer (~> 1.6) - activejob (8.0.3) - activesupport (= 8.0.3) - globalid (>= 0.3.6) - activemodel (8.0.3) - activesupport (= 8.0.3) - activerecord (8.0.3) - activemodel (= 8.0.3) - activesupport (= 8.0.3) - timeout (>= 0.4.0) - activestorage (8.0.3) - actionpack (= 8.0.3) - activejob (= 8.0.3) - activerecord (= 8.0.3) - activesupport (= 8.0.3) - marcel (~> 1.0) - activesupport (8.0.3) - base64 - benchmark (>= 0.3) - bigdecimal - concurrent-ruby (~> 1.0, >= 1.3.1) - connection_pool (>= 2.2.5) - drb - i18n (>= 1.6, < 2) - logger (>= 1.4.2) - minitest (>= 5.1) - securerandom (>= 0.3) - tzinfo (~> 2.0, >= 2.0.5) - uri (>= 0.13.1) - addressable (2.8.7) - public_suffix (>= 2.0.2, < 7.0) - appraisal (2.5.0) - bundler - rake - thor (>= 0.14.0) - 
ast (2.4.3) - async (2.34.0) - console (~> 1.29) - fiber-annotation - io-event (~> 1.11) - metrics (~> 0.12) - traces (~> 0.18) - base64 (0.3.0) - benchmark (0.5.0) - bigdecimal (3.3.1) - builder (3.3.0) - childprocess (5.1.0) - logger (~> 1.5) - codecov (0.2.12) - json - simplecov - coderay (1.1.3) - concurrent-ruby (1.3.5) - connection_pool (2.5.4) - console (1.34.2) - fiber-annotation - fiber-local (~> 1.1) - json - crack (1.0.0) - bigdecimal - rexml - crass (1.0.6) - date (3.4.1) - diff-lcs (1.6.2) - docile (1.4.1) - dotenv (3.1.8) - drb (2.2.3) - erb (5.1.1) - erubi (1.13.1) - event_stream_parser (1.0.0) - faraday (2.14.0) - faraday-net_http (>= 2.0, < 3.5) - json - logger - faraday-multipart (1.1.1) - multipart-post (~> 2.0) - faraday-net_http (3.4.1) - net-http (>= 0.5.0) - faraday-retry (2.3.2) - faraday (~> 2.0) - ferrum (0.17.1) - addressable (~> 2.5) - base64 (~> 0.2) - concurrent-ruby (~> 1.1) - webrick (~> 1.7) - websocket-driver (~> 0.7) - ffi (1.17.2-x86_64-linux-gnu) - fiber-annotation (0.2.0) - fiber-local (1.1.0) - fiber-storage - fiber-storage (1.0.1) - flay (2.13.3) - erubi (~> 1.10) - path_expander (~> 1.0) - ruby_parser (~> 3.0) - sexp_processor (~> 4.0) - globalid (1.3.0) - activesupport (>= 6.1) - google-cloud-env (2.3.1) - base64 (~> 0.2) - faraday (>= 1.0, < 3.a) - google-logging-utils (0.2.0) - googleauth (1.15.1) - faraday (>= 1.0, < 3.a) - google-cloud-env (~> 2.2) - google-logging-utils (~> 0.1) - jwt (>= 1.4, < 4.0) - multi_json (~> 1.11) - os (>= 0.9, < 2.0) - signet (>= 0.16, < 2.a) - hashdiff (1.2.1) - i18n (1.14.7) - concurrent-ruby (~> 1.0) - image_processing (1.14.0) - mini_magick (>= 4.9.5, < 6) - ruby-vips (>= 2.0.17, < 3) - iniparse (1.5.0) - io-console (0.8.1) - io-event (1.14.0) - irb (1.15.2) - pp (>= 0.6.0) - rdoc (>= 4.0.0) - reline (>= 0.4.2) - json (2.15.1) - json-schema (6.0.0) - addressable (~> 2.8) - bigdecimal (~> 3.1) - jwt (3.1.2) - base64 - language_server-protocol (3.17.0.5) - lint_roller (1.1.0) - logger 
(1.7.0) - loofah (2.24.1) - crass (~> 1.0.2) - nokogiri (>= 1.12.0) - mail (2.8.1) - mini_mime (>= 0.1.1) - net-imap - net-pop - net-smtp - marcel (1.1.0) - method_source (1.1.0) - metrics (0.15.0) - mini_magick (5.3.1) - logger - mini_mime (1.1.5) - minitest (5.26.0) - multi_json (1.17.0) - multipart-post (2.4.1) - net-http (0.6.0) - uri - net-imap (0.5.12) - date - net-protocol - net-pop (0.1.2) - net-protocol - net-protocol (0.2.2) - timeout - net-smtp (0.5.1) - net-protocol - nio4r (2.7.4) - nokogiri (1.18.10-x86_64-linux-gnu) - racc (~> 1.4) - os (1.1.4) - overcommit (0.68.0) - childprocess (>= 0.6.3, < 6) - iniparse (~> 1.4) - rexml (>= 3.3.9) - parallel (1.27.0) - parser (3.3.9.0) - ast (~> 2.4.1) - racc - path_expander (1.1.3) - pp (0.6.3) - prettyprint - prettyprint (0.2.0) - prism (1.6.0) - pry (0.15.2) - coderay (~> 1.1) - method_source (~> 1.0) - psych (5.2.6) - date - stringio - public_suffix (6.0.2) - racc (1.8.1) - rack (3.2.3) - rack-session (2.1.1) - base64 (>= 0.1.0) - rack (>= 3.0.0) - rack-test (2.2.0) - rack (>= 1.3) - rackup (2.2.1) - rack (>= 3) - rails (8.0.3) - actioncable (= 8.0.3) - actionmailbox (= 8.0.3) - actionmailer (= 8.0.3) - actionpack (= 8.0.3) - actiontext (= 8.0.3) - actionview (= 8.0.3) - activejob (= 8.0.3) - activemodel (= 8.0.3) - activerecord (= 8.0.3) - activestorage (= 8.0.3) - activesupport (= 8.0.3) - bundler (>= 1.15.0) - railties (= 8.0.3) - rails-dom-testing (2.3.0) - activesupport (>= 5.0.0) - minitest - nokogiri (>= 1.6) - rails-html-sanitizer (1.6.2) - loofah (~> 2.21) - nokogiri (>= 1.15.7, != 1.16.7, != 1.16.6, != 1.16.5, != 1.16.4, != 1.16.3, != 1.16.2, != 1.16.1, != 1.16.0.rc1, != 1.16.0) - railties (8.0.3) - actionpack (= 8.0.3) - activesupport (= 8.0.3) - irb (~> 1.13) - rackup (>= 1.0.0) - rake (>= 12.2) - thor (~> 1.0, >= 1.2.2) - tsort (>= 0.2) - zeitwerk (~> 2.6) - rainbow (3.1.1) - rake (13.3.0) - rdoc (6.15.0) - erb - psych (>= 4.0.0) - tsort - regexp_parser (2.11.3) - reline (0.6.2) - io-console (~> 
0.5) - rexml (3.4.4) - rspec (3.13.2) - rspec-core (~> 3.13.0) - rspec-expectations (~> 3.13.0) - rspec-mocks (~> 3.13.0) - rspec-core (3.13.6) - rspec-support (~> 3.13.0) - rspec-expectations (3.13.5) - diff-lcs (>= 1.2.0, < 2.0) - rspec-support (~> 3.13.0) - rspec-mocks (3.13.6) - diff-lcs (>= 1.2.0, < 2.0) - rspec-support (~> 3.13.0) - rspec-support (3.13.6) - rubocop (1.81.6) - json (~> 2.3) - language_server-protocol (~> 3.17.0.2) - lint_roller (~> 1.1.0) - parallel (~> 1.10) - parser (>= 3.3.0.2) - rainbow (>= 2.2.2, < 4.0) - regexp_parser (>= 2.9.3, < 3.0) - rubocop-ast (>= 1.47.1, < 2.0) - ruby-progressbar (~> 1.7) - unicode-display_width (>= 2.4.0, < 4.0) - rubocop-ast (1.47.1) - parser (>= 3.3.7.2) - prism (~> 1.4) - rubocop-performance (1.26.1) - lint_roller (~> 1.1) - rubocop (>= 1.75.0, < 2.0) - rubocop-ast (>= 1.47.1, < 2.0) - rubocop-rake (0.7.1) - lint_roller (~> 1.1) - rubocop (>= 1.72.1) - rubocop-rspec (3.7.0) - lint_roller (~> 1.1) - rubocop (~> 1.72, >= 1.72.1) - ruby-progressbar (1.13.0) - ruby-vips (2.2.5) - ffi (~> 1.12) - logger - ruby_llm-schema (0.1.9) - ruby_parser (3.21.1) - racc (~> 1.5) - sexp_processor (~> 4.16) - securerandom (0.4.1) - sexp_processor (4.17.4) - signet (0.21.0) - addressable (~> 2.8) - faraday (>= 0.17.5, < 3.a) - jwt (>= 1.5, < 4.0) - multi_json (~> 1.10) - simplecov (0.22.0) - docile (~> 1.1) - simplecov-html (~> 0.11) - simplecov_json_formatter (~> 0.1) - simplecov-cobertura (3.1.0) - rexml - simplecov (~> 0.19) - simplecov-html (0.13.2) - simplecov_json_formatter (0.1.4) - sqlite3 (2.7.4-x86_64-linux-gnu) - stringio (3.1.7) - thor (1.4.0) - timeout (0.4.3) - traces (0.18.2) - tsort (0.2.0) - tzinfo (2.0.6) - concurrent-ruby (~> 1.0) - unicode-display_width (3.2.0) - unicode-emoji (~> 4.1) - unicode-emoji (4.1.0) - uri (1.0.4) - useragent (0.16.11) - vcr (6.3.1) - base64 - webmock (3.25.1) - addressable (>= 2.8.0) - crack (>= 0.3.2) - hashdiff (>= 0.4.0, < 2.0.0) - webrick (1.9.1) - websocket-driver (0.8.0) - 
base64 - websocket-extensions (>= 0.1.0) - websocket-extensions (0.1.5) - zeitwerk (2.7.3) - -PLATFORMS - x86_64-linux - -DEPENDENCIES - activerecord-jdbcsqlite3-adapter - appraisal - async - bundler (>= 2.0) - codecov - dotenv - ferrum - flay - googleauth - image_processing (~> 1.2) - irb - jdbc-sqlite3 - json-schema - nokogiri - overcommit (>= 0.66) - pry (>= 0.14) - rails (~> 8.0.0) - rake (>= 13.0) - reline - rspec (~> 3.12) - rubocop (>= 1.0) - rubocop-performance - rubocop-rake (>= 0.6) - rubocop-rspec - ruby_llm! - ruby_llm-schema (~> 0.1.0) - simplecov (>= 0.21) - simplecov-cobertura - sqlite3 - vcr - webmock (~> 3.18) - -BUNDLED WITH - 2.6.9 diff --git a/gemfiles/rails_8.1.gemfile b/gemfiles/rails_8.1.gemfile new file mode 100644 index 000000000..8ac2bacaf --- /dev/null +++ b/gemfiles/rails_8.1.gemfile @@ -0,0 +1,38 @@ +# This file was generated by Appraisal + +source "https://rubygems.org" + +group :development do + gem "appraisal" + gem "async", platform: :mri + gem "bundler", ">= 2.0" + gem "codecov" + gem "dotenv" + gem "ferrum" + gem "flay" + gem "image_processing", "~> 1.2" + gem "irb" + gem "json-schema" + gem "nokogiri" + gem "overcommit", ">= 0.66" + gem "pry", ">= 0.14" + gem "rails", "~> 8.1.0" + gem "rake", ">= 13.0" + gem "reline" + gem "rspec", "~> 3.12" + gem "rubocop", ">= 1.0" + gem "rubocop-performance" + gem "rubocop-rake", ">= 0.6" + gem "rubocop-rspec" + gem "ruby_llm-schema", "~> 0.1.0" + gem "simplecov", ">= 0.21" + gem "simplecov-cobertura" + gem "activerecord-jdbcsqlite3-adapter", platform: "jruby" + gem "jdbc-sqlite3", platform: "jruby" + gem "sqlite3", platform: "mri" + gem "vcr" + gem "webmock", "~> 3.18" + gem "googleauth" +end + +gemspec path: "../" diff --git a/lib/ruby_llm.rb b/lib/ruby_llm.rb index dfe1c5cd3..18dedac4a 100644 --- a/lib/ruby_llm.rb +++ b/lib/ruby_llm.rb @@ -58,6 +58,10 @@ def paint(...) Image.paint(...) end + def tts(...) + Speech.tts(...) + end + def transcribe(...) Transcription.transcribe(...) 
end diff --git a/lib/ruby_llm/active_record/model_methods.rb b/lib/ruby_llm/active_record/model_methods.rb index 4f8b935b7..8313b475a 100644 --- a/lib/ruby_llm/active_record/model_methods.rb +++ b/lib/ruby_llm/active_record/model_methods.rb @@ -77,7 +77,7 @@ def to_llm delegate :supports?, :supports_vision?, :supports_functions?, :type, :input_price_per_million, :output_price_per_million, :function_calling?, :structured_output?, :batch?, - :reasoning?, :citations?, :streaming?, + :reasoning?, :citations?, :streaming?, :provider_class, to: :to_llm end end diff --git a/lib/ruby_llm/attachment.rb b/lib/ruby_llm/attachment.rb index ea3aab8ef..1e4a8e931 100644 --- a/lib/ruby_llm/attachment.rb +++ b/lib/ruby_llm/attachment.rb @@ -65,6 +65,14 @@ def encoded Base64.strict_encode64(content) end + def save(path) + return unless io_like? + + File.open(path, 'wb') do |f| # binary mode + write: store the bytes verbatim, no trailing newline + f.write(@source.read) + end + end + def for_llm case type when :text diff --git a/lib/ruby_llm/configuration.rb b/lib/ruby_llm/configuration.rb index e1c12902a..b912618de 100644 --- a/lib/ruby_llm/configuration.rb +++ b/lib/ruby_llm/configuration.rb @@ -29,6 +29,7 @@ class Configuration :default_embedding_model, :default_moderation_model, :default_image_model, + :default_audio_model, :default_transcription_model, # Model registry :model_registry_file, @@ -60,6 +61,7 @@ def initialize @default_embedding_model = 'text-embedding-3-small' @default_moderation_model = 'omni-moderation-latest' @default_image_model = 'gpt-image-1' + @default_audio_model = 'gpt-4o-mini-tts' @default_transcription_model = 'whisper-1' @model_registry_file = File.expand_path('models.json', __dir__) diff --git a/lib/ruby_llm/connection.rb b/lib/ruby_llm/connection.rb index 8db883c89..5c9245988 100644 --- a/lib/ruby_llm/connection.rb +++ b/lib/ruby_llm/connection.rb @@ -65,7 +65,7 @@ def setup_logging(faraday) errors: true, headers: false, log_level: :debug do |logger| - logger.filter(%r{[A-Za-z0-9+/=]{100,}}, 'data":"[BASE64 DATA]"') +
logger.filter(%r{[A-Za-z0-9+/=]{100,}}, '[BASE64 DATA]') logger.filter(/[-\d.e,\s]{100,}/, '[EMBEDDINGS ARRAY]') end end diff --git a/lib/ruby_llm/model/info.rb b/lib/ruby_llm/model/info.rb index e234e8b17..10706e67c 100644 --- a/lib/ruby_llm/model/info.rb +++ b/lib/ruby_llm/model/info.rb @@ -72,6 +72,10 @@ def output_price_per_million pricing.text_tokens.output end + def provider_class + RubyLLM::Provider.resolve provider + end + def type # rubocop:disable Metrics/PerceivedComplexity if modalities.output.include?('embeddings') && !modalities.output.include?('text') 'embedding' diff --git a/lib/ruby_llm/provider.rb b/lib/ruby_llm/provider.rb index 025e91ab0..a4bbe51e0 100644 --- a/lib/ruby_llm/provider.rb +++ b/lib/ruby_llm/provider.rb @@ -82,6 +82,12 @@ def paint(prompt, model:, size:) parse_image_response(response, model:) end + def tts(input, model:, voice:) + payload = render_speech_payload(input, model:, voice:) + response = @connection.post speech_url, payload + parse_speech_response(response, model:) + end + def transcribe(audio_file, model:, language:, **options) file_part = build_audio_file_part(audio_file) payload = render_transcription_payload(file_part, model:, language:, **options) @@ -167,9 +173,13 @@ def register(name, provider_class) providers[name.to_sym] = provider_class end + def resolve(name) + providers[name.to_sym] + end + def for(model) model_info = Models.find(model) - providers[model_info.provider.to_sym] + resolve model_info.provider end def providers diff --git a/lib/ruby_llm/providers/gemini.rb b/lib/ruby_llm/providers/gemini.rb index 0bba3e275..b3c8f8629 100644 --- a/lib/ruby_llm/providers/gemini.rb +++ b/lib/ruby_llm/providers/gemini.rb @@ -7,6 +7,7 @@ class Gemini < Provider include Gemini::Chat include Gemini::Embeddings include Gemini::Images + include Gemini::Speech include Gemini::Models include Gemini::Transcription include Gemini::Streaming diff --git a/lib/ruby_llm/providers/gemini/chat.rb 
b/lib/ruby_llm/providers/gemini/chat.rb index 101a70c96..3233ce939 100644 --- a/lib/ruby_llm/providers/gemini/chat.rb +++ b/lib/ruby_llm/providers/gemini/chat.rb @@ -50,22 +50,9 @@ def format_role(role) def format_parts(msg) if msg.tool_call? - [{ - functionCall: { - name: msg.tool_calls.values.first.name, - args: msg.tool_calls.values.first.arguments - } - }] + format_tool_call(msg) elsif msg.tool_result? - [{ - functionResponse: { - name: msg.tool_call_id, - response: { - name: msg.tool_call_id, - content: Media.format_content(msg.content) - } - } - }] + format_tool_result(msg) else Media.format_content(msg.content) end @@ -77,7 +64,7 @@ def parse_completion_response(response) Message.new( role: :assistant, - content: extract_content(data), + content: parse_content(data), tool_calls: tool_calls, input_tokens: data.dig('usageMetadata', 'promptTokenCount'), output_tokens: calculate_output_tokens(data), @@ -109,17 +96,16 @@ def normalize_any_of(schema) { type: 'string', nullable: true } end - def extract_content(data) + def parse_content(data) candidate = data.dig('candidates', 0) return '' unless candidate return '' if function_call?(candidate) parts = candidate.dig('content', 'parts') - text_parts = parts&.select { |p| p['text'] } - return '' unless text_parts&.any? + return '' unless parts&.any? 
- text_parts.map { |p| p['text'] }.join + build_response_content(parts) end def function_call?(candidate) diff --git a/lib/ruby_llm/providers/gemini/media.rb b/lib/ruby_llm/providers/gemini/media.rb index 854d042c5..6a7dcbae3 100644 --- a/lib/ruby_llm/providers/gemini/media.rb +++ b/lib/ruby_llm/providers/gemini/media.rb @@ -2,7 +2,7 @@ module RubyLLM module Providers - class Gemini + class Gemini # rubocop:disable Style/Documentation # Media handling methods for the Gemini API integration module Media module_function @@ -50,6 +50,63 @@ def format_text(text) } end end + + def build_response_content(parts) # rubocop:disable Metrics/PerceivedComplexity + text = [] + attachments = [] + + parts.each_with_index do |part, index| + if part['text'] + text << part['text'] + elsif part['inlineData'] + attachment = build_inline_attachment(part['inlineData'], index) + attachments << attachment if attachment + elsif part['fileData'] + attachment = build_file_attachment(part['fileData'], index) + attachments << attachment if attachment + end + end + + text = text.join + text = nil if text.empty? + return text if attachments.empty? 
+ + Content.new(text:, attachments:) + end + + def build_inline_attachment(inline_data, index) + encoded = inline_data['data'] + return unless encoded + + mime_type = inline_data['mimeType'] + decoded = Base64.decode64(encoded) + io = StringIO.new(decoded) + io.set_encoding(Encoding::BINARY) if io.respond_to?(:set_encoding) + + filename = attachment_filename(mime_type, index) + RubyLLM::Attachment.new(io, filename:) + rescue ArgumentError => e + RubyLLM.logger.warn "Failed to decode Gemini inline data attachment: #{e.message}" + nil + end + + def build_file_attachment(file_data, index) + uri = file_data['fileUri'] + return unless uri + + filename = file_data['filename'] || attachment_filename(file_data['mimeType'], index) + RubyLLM::Attachment.new(uri, filename:) + end + + def attachment_filename(mime_type, index) + return "gemini_attachment_#{index + 1}" unless mime_type + + extension = mime_type.split('/').last.to_s + extension = 'jpg' if extension == 'jpeg' + extension = 'txt' if extension == 'plain' + extension = extension.tr('+', '.') + "gemini_attachment_#{index + 1}.#{extension}" + end end end end diff --git a/lib/ruby_llm/providers/gemini/speech.rb b/lib/ruby_llm/providers/gemini/speech.rb new file mode 100644 index 000000000..167a90afc --- /dev/null +++ b/lib/ruby_llm/providers/gemini/speech.rb @@ -0,0 +1,47 @@ +# frozen_string_literal: true + +module RubyLLM + module Providers + class Gemini + # Speech generation methods for the Gemini API integration + module Speech + module_function + + def speech_url + "models/#{@model}:generateContent" + end + + def render_speech_payload(input, model:, voice:) + @model = model + { + contents: [{ + role: 'user', + parts: [{ text: input }] + }], + generationConfig: { + responseModalities: ['AUDIO'], + speechConfig: { + voiceConfig: { + prebuiltVoiceConfig: { + voiceName: voice + } + } + } + }, + model: model + } + end + + def parse_speech_response(response, model:) + base64_audio = 
response.body['candidates'][0]['content']['parts'][0]['inlineData']['data'] + pcm_data = Base64.decode64(base64_audio) + + RubyLLM::Speech.new( + model: model, + data: pcm_data + ) + end + end + end + end +end diff --git a/lib/ruby_llm/providers/gemini/tools.rb b/lib/ruby_llm/providers/gemini/tools.rb index 63f095f08..9ec0e084c 100644 --- a/lib/ruby_llm/providers/gemini/tools.rb +++ b/lib/ruby_llm/providers/gemini/tools.rb @@ -13,6 +13,27 @@ def format_tools(tools) }] end + def format_tool_call(msg) + [{ + functionCall: { + name: msg.tool_calls.values.first.name, + args: msg.tool_calls.values.first.arguments + } + }] + end + + def format_tool_result(msg) + [{ + functionResponse: { + name: msg.tool_call_id, + response: { + name: msg.tool_call_id, + content: Media.format_content(msg.content) + } + } + }] + end + def extract_tool_calls(data) return nil unless data diff --git a/lib/ruby_llm/providers/openai.rb b/lib/ruby_llm/providers/openai.rb index e8a07bc4c..c2257954d 100644 --- a/lib/ruby_llm/providers/openai.rb +++ b/lib/ruby_llm/providers/openai.rb @@ -11,6 +11,7 @@ class OpenAI < Provider include OpenAI::Streaming include OpenAI::Tools include OpenAI::Images + include OpenAI::Speech include OpenAI::Media include OpenAI::Transcription diff --git a/lib/ruby_llm/providers/openai/speech.rb b/lib/ruby_llm/providers/openai/speech.rb new file mode 100644 index 000000000..0a4079568 --- /dev/null +++ b/lib/ruby_llm/providers/openai/speech.rb @@ -0,0 +1,32 @@ +# frozen_string_literal: true + +module RubyLLM + module Providers + class OpenAI + # Speech generation methods for the OpenAI API integration + module Speech + module_function + + def speech_url + 'audio/speech' + end + + def render_speech_payload(input, model:, voice:) + { + model: model, + input: input, + voice: voice + } + end + + def parse_speech_response(response, model:) + data = response.body + RubyLLM::Speech.new( + model: model, + data: data + ) + end + end + end + end +end diff --git 
a/lib/ruby_llm/speech.rb b/lib/ruby_llm/speech.rb new file mode 100644 index 000000000..59ca23a3f --- /dev/null +++ b/lib/ruby_llm/speech.rb @@ -0,0 +1,32 @@ +# frozen_string_literal: true + +module RubyLLM + # Represents generated speech audio from an AI model. + class Speech + attr_reader :model, :data + + def initialize(data:, model: nil) + @model = model + @data = data + end + + def save(path) + File.binwrite(File.expand_path(path), data) + path + end + + def self.tts(input, # rubocop:disable Metrics/ParameterLists + model: nil, + provider: nil, + assume_model_exists: false, + voice: 'alloy', + context: nil) + config = context&.config || RubyLLM.config + model ||= config.default_audio_model + model, provider_instance = Models.resolve(model, provider: provider, assume_exists: assume_model_exists, + config: config) + + provider_instance.tts(input, model: model.id, voice:) + end + end +end diff --git a/spec/fixtures/vcr_cassettes/speech_basic_functionality_gemini_gemini-2_5-flash-preview-tts_can_generate_audio_from_text.yml b/spec/fixtures/vcr_cassettes/speech_basic_functionality_gemini_gemini-2_5-flash-preview-tts_can_generate_audio_from_text.yml new file mode 100644 index 000000000..de98f16b2 --- /dev/null +++ b/spec/fixtures/vcr_cassettes/speech_basic_functionality_gemini_gemini-2_5-flash-preview-tts_can_generate_audio_from_text.yml @@ -0,0 +1,89 @@ +--- +http_interactions: +- request: + method: post + uri: https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-preview-tts:generateContent + body: + encoding: UTF-8 + string: '{"contents":[{"role":"user","parts":[{"text":"Hello, welcome!"}]}],"generationConfig":{"responseModalities":["AUDIO"],"speechConfig":{"voiceConfig":{"prebuiltVoiceConfig":{"voiceName":"Sadachbia"}}}},"model":"gemini-2.5-flash-preview-tts"}' + headers: + User-Agent: + - Faraday v2.14.0 + X-Goog-Api-Key: + - "" + Content-Type: + - application/json + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*"
+ response: + status: + code: 200 + message: OK + headers: + Content-Type: + - application/json; charset=UTF-8 + Vary: + - Origin + - Referer + - X-Origin + Date: + - Wed, 29 Oct 2025 08:34:53 GMT + Server: + - scaffolding on HTTPServer2 + X-Xss-Protection: + - '0' + X-Frame-Options: + - SAMEORIGIN + X-Content-Type-Options: + - nosniff + Server-Timing: + - gfet4t7; dur=2463 + Alt-Svc: + - h3=":443"; ma=2592000,h3-29=":443"; ma=2592000 + Transfer-Encoding: + - chunked + body: + encoding: ASCII-8BIT + string: | + { + "candidates": [ + { + "content": { + "parts": [ + { + "inlineData": { + "mimeType": "audio/L16;codec=pcm;rate=24000", + "data": "" + } + } + ], + "role": "model" + }, + "finishReason": "STOP", + "index": 0 + } + ], + "usageMetadata": { + "promptTokenCount": 5, + "candidatesTokenCount": 48, + "totalTokenCount": 53, + "promptTokensDetails": [ + { + "modality": "TEXT", + "tokenCount": 5 + } + ], + "candidatesTokensDetails": [ + { + "modality": "AUDIO", + "tokenCount": 48 + } + ] + }, + "modelVersion": "gemini-2.5-flash-preview-tts", + "responseId": "rdEBacmoMKSUvdIPw57pqAQ" + } + recorded_at: Wed, 29 Oct 2025 08:34:53 GMT +recorded_with: VCR 6.3.1 diff --git a/spec/fixtures/vcr_cassettes/speech_basic_functionality_openai_gpt-4o-mini-tts_can_generate_audio_from_text.yml b/spec/fixtures/vcr_cassettes/speech_basic_functionality_openai_gpt-4o-mini-tts_can_generate_audio_from_text.yml new file mode 100644 index 000000000..63a349835 --- /dev/null +++ b/spec/fixtures/vcr_cassettes/speech_basic_functionality_openai_gpt-4o-mini-tts_can_generate_audio_from_text.yml @@ -0,0 +1,81 @@ +--- +http_interactions: +- request: + method: post + uri: https://api.openai.com/v1/audio/speech + body: + encoding: UTF-8 + string: '{"model":"gpt-4o-mini-tts","input":"Hello, welcome to RubyLLM!","voice":"alloy"}' + headers: + User-Agent: + - Faraday v2.14.0 + Authorization: + - Bearer + Content-Type: + - application/json + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + 
Accept: + - "*/*" + response: + status: + code: 200 + message: OK + headers: + Date: + - Tue, 28 Oct 2025 17:00:36 GMT + Content-Type: + - audio/mpeg + Transfer-Encoding: + - chunked + Connection: + - keep-alive + Access-Control-Expose-Headers: + - X-Request-ID + Openai-Organization: + - "" + Openai-Processing-Ms: + - '686' + Openai-Project: + - proj_vREE4lXbKbMWd0Fd7pcAzVjH + Openai-Version: + - '2020-10-01' + X-Envoy-Upstream-Service-Time: + - '1034' + X-Ratelimit-Limit-Requests: + - '10000' + X-Ratelimit-Limit-Tokens: + - '200000' + X-Ratelimit-Remaining-Requests: + - '9999' + X-Ratelimit-Remaining-Tokens: + - '199994' + X-Ratelimit-Reset-Requests: + - 8.64s + X-Ratelimit-Reset-Tokens: + - 1ms + X-Request-Id: + - "" + X-Openai-Proxy-Wasm: + - v0.1 + Cf-Cache-Status: + - DYNAMIC + Set-Cookie: + - "" + - "" + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + X-Content-Type-Options: + - nosniff + Server: + - cloudflare + Cf-Ray: + - "" + Alt-Svc: + - h3=":443"; ma=86400 + body: + encoding: ASCII-8BIT + string: !binary |- +  + recorded_at: Tue, 28 Oct 2025 17:00:36 GMT +recorded_with: VCR 6.3.1 diff --git a/spec/ruby_llm/speech_spec.rb b/spec/ruby_llm/speech_spec.rb new file mode 100644 index 000000000..4ef8a8b95 --- /dev/null +++ b/spec/ruby_llm/speech_spec.rb @@ -0,0 +1,54 @@ +# frozen_string_literal: true + +require 'spec_helper' +require 'tempfile' + +def save_and_verify_audio(audio) + # Create a temp file to save to + temp_file = Tempfile.new(['audio', '.mp3']) + temp_path = temp_file.path + temp_file.close + + begin + saved_path = audio.save(temp_path) + expect(saved_path).to eq(temp_path) + expect(File.exist?(temp_path)).to be true + + file_size = File.size(temp_path) + expect(file_size).to be > 1000 # Any real audio should be larger than 1KB + ensure + # Clean up + File.delete(temp_path) + end +end + +RSpec.describe RubyLLM::Speech do + include_context 'with configured RubyLLM' + + describe 'basic functionality' do + 
SPEECH_MODELS.each do |config| + provider = config[:provider] + model = config[:model] + + it "#{provider}/#{model} can generate audio from text" do + voice = provider == :gemini ? 'Sadachbia' : 'alloy' + audio = RubyLLM.tts( + 'Hello, welcome!', + model: model, + provider: provider, + voice: voice + ) + + expect(audio.model).to eq(model) + + save_and_verify_audio audio + end + end + + it 'validates model existence' do + expect do + RubyLLM.tts('Hello, welcome!', model: 'invalid-audio-model') + end.to raise_error(RubyLLM::ModelNotFoundError) + end + end +end diff --git a/spec/support/models_to_test.rb b/spec/support/models_to_test.rb index 99dc57d90..90fdcaaef 100644 --- a/spec/support/models_to_test.rb +++ b/spec/support/models_to_test.rb @@ -57,3 +57,8 @@ { provider: :gemini, model: 'gemini-2.5-flash' }, { provider: :vertexai, model: 'gemini-2.5-flash' } ].freeze + +SPEECH_MODELS = [ + { provider: :openai, model: 'gpt-4o-mini-tts' }, + { provider: :gemini, model: 'gemini-2.5-flash-preview-tts' } +].freeze