Add `supports_video?` helper for models; docs: clarify vision vs. video support

arnodirlam · altxtech · commit 4a8a5f5a3474 · 2025-09-11T17:20:43.000-03:00
diff --git a/README.md b/README.md
@@ -34,13 +34,14 @@ RubyLLM gives you one beautiful API for all of them. Same interface whether you'
 
 ```ruby
 # Just ask questions
-chat = RubyLLM.chat
+chat = RubyLLM.chat(model: "gemini-2.0-flash")
 chat.ask "What's the best way to learn Ruby?"
 ```
 
 ```ruby
 # Analyze any file type
 chat.ask "What's in this image?", with: "ruby_conf.jpg"
+chat.ask "What's happening in this video?", with: "presentation.mp4"
 chat.ask "Describe this meeting", with: "meeting.wav"
 chat.ask "Summarize this document", with: "contract.pdf"
 chat.ask "Explain this code", with: "app.rb"
diff --git a/lib/ruby_llm/model/info.rb b/lib/ruby_llm/model/info.rb
@@ -56,6 +56,10 @@ def supports_vision?
         modalities.input.include?('image')
       end
 
+      def supports_video?
+        modalities.input.include?('video')
+      end
+
       def supports_functions?
         function_calling?
       end
diff --git a/lib/ruby_llm/providers/gemini/capabilities.rb b/lib/ruby_llm/providers/gemini/capabilities.rb
@@ -52,6 +52,10 @@ def supports_vision?(model_id)
           model_id.match?(/gemini|flash|pro|imagen/)
         end
 
+        def supports_video?(model_id)
+          model_id.match?(/gemini/)
+        end
+
         def supports_functions?(model_id)
           return false if model_id.match?(/text-embedding|embedding-001|aqa|flash-lite|imagen|gemini-2\.0-flash-lite/)
 
@@ -214,10 +218,10 @@ def modalities_for(model_id)
 
           if supports_vision?(model_id)
             modalities[:input] << 'image'
-            modalities[:input] << 'video'
             modalities[:input] << 'pdf'
           end
 
+          modalities[:input] << 'video' if supports_video?(model_id)
           modalities[:input] << 'audio' if model_id.match?(/audio/)
           modalities[:output] << 'embeddings' if model_id.match?(/embedding|gemini-embedding/)
           modalities[:output] = ['image'] if model_id.match?(/imagen/)
diff --git a/spec/ruby_llm/models_spec.rb b/spec/ruby_llm/models_spec.rb
@@ -36,11 +36,17 @@
 
       # There should be models from at least OpenAI and Anthropic
       expect(provider_counts.keys).to include('openai', 'anthropic')
+    end
 
-      # Select only models with vision support
+    it 'filters by vision support' do
       vision_models = RubyLLM.models.select(&:supports_vision?)
       expect(vision_models).to all(have_attributes(supports_vision?: true))
     end
+
+    it 'filters by video support' do
+      video_models = RubyLLM.models.select(&:supports_video?)
+      expect(video_models).to all(have_attributes(supports_video?: true))
+    end
   end
 
   describe 'finding models' do