
Commit 7c5bd08

multi-modal input (#144)
1 parent: c980bc4

5 files changed: +180 additions, −60 deletions

docs/examples/python/structured_output.md

Lines changed: 49 additions & 10 deletions

````diff
@@ -25,35 +25,73 @@ Structured Output Example
 This example demonstrates how to use structured output with Strands Agents to
 get type-safe, validated responses using Pydantic models.
 """
-
+import tempfile
 from typing import List, Optional
 from pydantic import BaseModel, Field
 from strands import Agent
 
 
 def basic_example():
     """Basic example extracting structured information from text."""
     print("\n--- Basic Example ---")
-
+
     class PersonInfo(BaseModel):
         name: str
         age: int
         occupation: str
 
     agent = Agent()
     result = agent.structured_output(
-        PersonInfo,
+        PersonInfo,
         "John Smith is a 30-year-old software engineer"
     )
 
     print(f"Name: {result.name}")  # "John Smith"
-    print(f"Age: {result.age}")  # 30
+    print(f"Age: {result.age}")  # 30
+    print(f"Job: {result.occupation}")  # "software engineer"
+
+
+def multimodal_example():
+    """Basic example extracting structured information from a document."""
+    print("\n--- Multi-Modal Example ---")
+
+    class PersonInfo(BaseModel):
+        name: str
+        age: int
+        occupation: str
+
+    with tempfile.NamedTemporaryFile() as person_file:
+        person_file.write(b"John Smith is a 30-year old software engineer")
+        person_file.flush()
+
+        with open(person_file.name, "rb") as fp:
+            document_bytes = fp.read()
+
+    agent = Agent()
+    result = agent.structured_output(
+        PersonInfo,
+        [
+            {"text": "Please process this application."},
+            {
+                "document": {
+                    "format": "txt",
+                    "name": "application",
+                    "source": {
+                        "bytes": document_bytes,
+                    },
+                },
+            },
+        ]
+    )
+
+    print(f"Name: {result.name}")  # "John Smith"
+    print(f"Age: {result.age}")  # 30
     print(f"Job: {result.occupation}")  # "software engineer"
 
 
 def conversation_history_example():
     """Example using conversation history with structured output."""
     print("\n--- Conversation History Example ---")
-
+
     agent = Agent()
 
     # Build up conversation context
@@ -71,7 +109,7 @@ def conversation_history_example():
     # Uses existing conversation context with a prompt
     print("Extracting structured information from conversation context...")
     result = agent.structured_output(CityInfo, "Extract structured information about Paris")
-
+
     print(f"City: {result.city}")
     print(f"Country: {result.country}")
     print(f"Population: {result.population}")
@@ -81,7 +119,7 @@ def conversation_history_example():
 def complex_nested_model_example():
     """Example handling complex nested data structures."""
     print("\n--- Complex Nested Model Example ---")
-
+
     class Address(BaseModel):
         street: str
         city: str
@@ -117,11 +155,12 @@ def complex_nested_model_example():
 
 if __name__ == "__main__":
     print("Structured Output Examples\n")
-
+
     basic_example()
+    multimodal_example()
     conversation_history_example()
     complex_nested_model_example()
-
+
     print("\nExamples completed.")
 ```
 
@@ -143,4 +182,4 @@ The `structured_output()` method ensures that the language model generates a res
 
 ## Learn More
 
-For more details on structured output, see the [Structured Output documentation](../../user-guide/concepts/agents/structured-output.md).
+For more details on structured output, see the [Structured Output documentation](../../user-guide/concepts/agents/structured-output.md).
````
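The new `multimodal_example` routes its document through a temporary file; the same `structured_output` call shape also works with bytes supplied inline. A minimal sketch under that assumption (the inline byte string is illustrative):

```python
from pydantic import BaseModel
from strands import Agent

class PersonInfo(BaseModel):
    name: str
    age: int
    occupation: str

agent = Agent()
result = agent.structured_output(
    PersonInfo,
    [
        {"text": "Please process this application."},
        {
            "document": {
                "format": "txt",
                "name": "application",
                # Inline bytes stand in for the temp-file round trip above.
                "source": {"bytes": b"John Smith is a 30-year-old software engineer"},
            },
        },
    ],
)
print(result.name, result.age, result.occupation)
```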

docs/examples/python/structured_output.py

Lines changed: 49 additions & 9 deletions

```diff
@@ -5,6 +5,7 @@
 This example demonstrates how to use structured output with Strands Agents to
 get type-safe, validated responses using Pydantic models.
 """
+import tempfile
 
 from typing import List, Optional
 from pydantic import BaseModel, Field
@@ -13,27 +14,65 @@
 def basic_example():
     """Basic example extracting structured information from text."""
     print("\n--- Basic Example ---")
-
+
     class PersonInfo(BaseModel):
         name: str
         age: int
         occupation: str
 
     agent = Agent()
     result = agent.structured_output(
-        PersonInfo,
+        PersonInfo,
         "John Smith is a 30-year-old software engineer"
     )
 
     print(f"Name: {result.name}")  # "John Smith"
-    print(f"Age: {result.age}")  # 30
+    print(f"Age: {result.age}")  # 30
+    print(f"Job: {result.occupation}")  # "software engineer"
+
+
+def multimodal_example():
+    """Basic example extracting structured information from a document."""
+    print("\n--- Multi-Modal Example ---")
+
+    class PersonInfo(BaseModel):
+        name: str
+        age: int
+        occupation: str
+
+    with tempfile.NamedTemporaryFile() as person_file:
+        person_file.write(b"John Smith is a 30-year old software engineer")
+        person_file.flush()
+
+        with open(person_file.name, "rb") as fp:
+            document_bytes = fp.read()
+
+    agent = Agent()
+    result = agent.structured_output(
+        PersonInfo,
+        [
+            {"text": "Please process this application."},
+            {
+                "document": {
+                    "format": "txt",
+                    "name": "application",
+                    "source": {
+                        "bytes": document_bytes,
+                    },
+                },
+            },
+        ]
+    )
+
+    print(f"Name: {result.name}")  # "John Smith"
+    print(f"Age: {result.age}")  # 30
     print(f"Job: {result.occupation}")  # "software engineer"
 
 
 def conversation_history_example():
     """Example using conversation history with structured output."""
     print("\n--- Conversation History Example ---")
-
+
     agent = Agent()
 
     # Build up conversation context
@@ -51,7 +90,7 @@ class CityInfo(BaseModel):
     # Uses existing conversation context with a prompt
     print("Extracting structured information from conversation context...")
     result = agent.structured_output(CityInfo, "Extract structured information about Paris")
-
+
     print(f"City: {result.city}")
     print(f"Country: {result.country}")
     print(f"Population: {result.population}")
@@ -61,7 +100,7 @@ class CityInfo(BaseModel):
 def complex_nested_model_example():
     """Example handling complex nested data structures."""
     print("\n--- Complex Nested Model Example ---")
-
+
     class Address(BaseModel):
         street: str
         city: str
@@ -97,9 +136,10 @@ class Person(BaseModel):
 
 if __name__ == "__main__":
     print("Structured Output Examples\n")
-
+
     basic_example()
+    multimodal_example()
     conversation_history_example()
     complex_nested_model_example()
-
-    print("\nExamples completed.")
+
+    print("\nExamples completed.")
```

docs/user-guide/concepts/agents/prompts.md

Lines changed: 28 additions & 5 deletions

````diff
@@ -24,23 +24,46 @@ If you do not specify a system prompt, the model will behave according to its de
 
 These are your queries or requests to the agent. The SDK supports multiple techniques for prompting.
 
-### Direct Prompting
+### Text Prompt
 
-The simplest way to interact with an agent is through direct prompting:
+The simplest way to interact with an agent is through a text prompt:
 
 ```python
 response = agent("What is the time in Seattle")
 ```
 
+### Multi-Modal Prompting
+The SDK also supports multi-modal prompts, allowing you to include images, documents, and other content types in your messages:
+
+```python
+with open("path/to/image.png", "rb") as fp:
+    image_bytes = fp.read()
+
+response = agent([
+    {"text": "What can you see in this image?"},
+    {
+        "image": {
+            "format": "png",
+            "source": {
+                "bytes": image_bytes,
+            },
+        },
+    },
+])
+```
+
+For a complete list of supported content types, please refer to the [API Reference](../../../api-reference/types.md#strands.types.content.ContentBlock).
+
+
 ### Direct Tool Calls
 
-For programmatic control, you can call tools directly:
+Prompting is a primary functionality of Strands that allows you to invoke tools through natural language requests. However, if at any point you require more programmatic control, Strands also allows you to invoke tools directly:
 
 ```python
 result = agent.tool.current_time(timezone="US/Pacific")
 ```
 
-This bypasses the natural language interface and directly executes the tool with the specified parameters. By default, direct tool calls are added to the [session state](state-sessions.md) but can be optionally not included by specifying `record_direct_tool_call=False`.
+This bypasses the natural language interface and directly executes the tool with the specified parameters. By default, direct tool calls are added to the [session state](state-sessions.md) but can be optionally excluded by specifying `record_direct_tool_call=False`.
 
 ## Prompt Engineering
 
@@ -52,4 +75,4 @@ Further resources:
 * [Amazon Bedrock - Prompt engineering concepts](https://docs.aws.amazon.com/bedrock/latest/userguide/prompt-engineering-guidelines.html)
 * [Llama - Prompting](https://www.llama.com/docs/how-to-guides/prompting/)
 * [Anthropic - Prompt engineering overview](https://docs.anthropic.com/en/docs/build-with-claude/prompt-engineering/overview)
-* [OpenAI - Prompt engineering](https://platform.openai.com/docs/guides/prompt-engineering/six-strategies-for-getting-better-results)
+* [OpenAI - Prompt engineering](https://platform.openai.com/docs/guides/prompt-engineering/six-strategies-for-getting-better-results)
````
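The revised paragraph mentions `record_direct_tool_call=False` without showing where the flag is passed. A minimal sketch, assuming the flag is accepted by the `Agent` constructor and that a `current_time` tool is available (both are assumptions, not confirmed by this diff):

```python
from strands import Agent
from strands_tools import current_time  # assumed tool module; adjust to your setup

# Keep direct tool calls out of the recorded session state.
agent = Agent(tools=[current_time], record_direct_tool_call=False)

# Executes the tool directly; with the flag above, the call is not
# added to the agent's session state.
result = agent.tool.current_time(timezone="US/Pacific")
```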

docs/user-guide/concepts/agents/structured-output.md

Lines changed: 36 additions & 3 deletions

````diff
@@ -14,7 +14,7 @@ flowchart LR
         direction TB
         C[convert_pydantic_to_tool_spec] --> D[LLM Response]
     end
-
+
     B --> Process
     Process --> E[Validated Pydantic Model]
 ```
@@ -69,7 +69,7 @@ class PersonInfo(BaseModel):
 
 agent = Agent()
 result = agent.structured_output(
-    PersonInfo,
+    PersonInfo,
     "John Smith is a 30-year-old software engineer"
 )
 
@@ -78,6 +78,40 @@ print(f"Age: {result.age}")  # 30
 print(f"Job: {result.occupation}")  # "software engineer"
 ```
 
+### Multi-Modal Input
+
+Extract structured information from prompts containing images, documents, and other content types:
+
+```python
+class PersonInfo(BaseModel):
+    name: str
+    age: int
+    occupation: str
+
+with open("path/to/document.pdf", "rb") as fp:
+    document_bytes = fp.read()
+
+agent = Agent()
+result = agent.structured_output(
+    PersonInfo,
+    [
+        {"text": "Please process this application."},
+        {
+            "document": {
+                "format": "pdf",
+                "name": "application",
+                "source": {
+                    "bytes": document_bytes,
+                },
+            },
+        },
+    ]
+)
+```
+
+For a complete list of supported content types, please refer to the [API Reference](../../../api-reference/types.md#strands.types.content.ContentBlock).
+
+
 ### Using Conversation History
 
 Structured output can work with existing conversation context:
@@ -159,7 +193,6 @@ except ValidationError as e:
     # 3. Extract partial information from the error
 ```
 
-
 ## Best Practices
 
 - **Keep models focused**: Define specific models for clear purposes
````