
Commit 7c5bd08

multi-modal input (#144)
1 parent: c980bc4

5 files changed: +180 additions, −60 deletions

docs/examples/python/structured_output.md

Lines changed: 49 additions & 10 deletions

````diff
@@ -25,35 +25,73 @@ Structured Output Example
 This example demonstrates how to use structured output with Strands Agents to
 get type-safe, validated responses using Pydantic models.
 """
-
+import tempfile
 from typing import List, Optional
 from pydantic import BaseModel, Field
 from strands import Agent
 
 
 def basic_example():
     """Basic example extracting structured information from text."""
     print("\n--- Basic Example ---")
-
+
     class PersonInfo(BaseModel):
         name: str
         age: int
         occupation: str
 
     agent = Agent()
     result = agent.structured_output(
-        PersonInfo,
+        PersonInfo,
         "John Smith is a 30-year-old software engineer"
     )
 
     print(f"Name: {result.name}")  # "John Smith"
-    print(f"Age: {result.age}")  # 30
+    print(f"Age: {result.age}")  # 30
+    print(f"Job: {result.occupation}")  # "software engineer"
+
+
+def multimodal_example():
+    """Basic example extracting structured information from a document."""
+    print("\n--- Multi-Modal Example ---")
+
+    class PersonInfo(BaseModel):
+        name: str
+        age: int
+        occupation: str
+
+    with tempfile.NamedTemporaryFile() as person_file:
+        person_file.write(b"John Smith is a 30-year old software engineer")
+        person_file.flush()
+
+        with open(person_file.name, "rb") as fp:
+            document_bytes = fp.read()
+
+    agent = Agent()
+    result = agent.structured_output(
+        PersonInfo,
+        [
+            {"text": "Please process this application."},
+            {
+                "document": {
+                    "format": "txt",
+                    "name": "application",
+                    "source": {
+                        "bytes": document_bytes,
+                    },
+                },
+            },
+        ]
+    )
+
+    print(f"Name: {result.name}")  # "John Smith"
+    print(f"Age: {result.age}")  # 30
     print(f"Job: {result.occupation}")  # "software engineer"
 
 
 def conversation_history_example():
     """Example using conversation history with structured output."""
     print("\n--- Conversation History Example ---")
-
+
     agent = Agent()
 
     # Build up conversation context
@@ -71,7 +109,7 @@ def conversation_history_example():
     # Uses existing conversation context with a prompt
     print("Extracting structured information from conversation context...")
     result = agent.structured_output(CityInfo, "Extract structured information about Paris")
-
+
     print(f"City: {result.city}")
     print(f"Country: {result.country}")
     print(f"Population: {result.population}")
@@ -81,7 +119,7 @@ def conversation_history_example():
 def complex_nested_model_example():
     """Example handling complex nested data structures."""
     print("\n--- Complex Nested Model Example ---")
-
+
     class Address(BaseModel):
         street: str
         city: str
@@ -117,11 +155,12 @@ def complex_nested_model_example():
 
 if __name__ == "__main__":
     print("Structured Output Examples\n")
-
+
     basic_example()
+    multimodal_example()
     conversation_history_example()
     complex_nested_model_example()
-
+
     print("\nExamples completed.")
 ```
 
@@ -143,4 +182,4 @@ The `structured_output()` method ensures that the language model generates a res
 
 ## Learn More
 
-For more details on structured output, see the [Structured Output documentation](../../user-guide/concepts/agents/structured-output.md).
+For more details on structured output, see the [Structured Output documentation](../../user-guide/concepts/agents/structured-output.md).
````
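The new `multimodal_example` routes its document through a temporary file; the same `structured_output` call shape also works with bytes supplied inline. A minimal sketch under that assumption (the inline byte string is illustrative):

```python
from pydantic import BaseModel
from strands import Agent

class PersonInfo(BaseModel):
    name: str
    age: int
    occupation: str

agent = Agent()
result = agent.structured_output(
    PersonInfo,
    [
        {"text": "Please process this application."},
        {
            "document": {
                "format": "txt",
                "name": "application",
                # Inline bytes stand in for the temp-file round trip above.
                "source": {"bytes": b"John Smith is a 30-year-old software engineer"},
            },
        },
    ],
)
print(result.name, result.age, result.occupation)
```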

docs/examples/python/structured_output.py

Lines changed: 49 additions & 9 deletions

```diff
@@ -5,6 +5,7 @@
 This example demonstrates how to use structured output with Strands Agents to
 get type-safe, validated responses using Pydantic models.
 """
+import tempfile
 
 from typing import List, Optional
 from pydantic import BaseModel, Field
@@ -13,27 +14,65 @@
 def basic_example():
     """Basic example extracting structured information from text."""
     print("\n--- Basic Example ---")
-
+
     class PersonInfo(BaseModel):
         name: str
         age: int
         occupation: str
 
     agent = Agent()
     result = agent.structured_output(
-        PersonInfo,
+        PersonInfo,
         "John Smith is a 30-year-old software engineer"
     )
 
     print(f"Name: {result.name}")  # "John Smith"
-    print(f"Age: {result.age}")  # 30
+    print(f"Age: {result.age}")  # 30
+    print(f"Job: {result.occupation}")  # "software engineer"
+
+
+def multimodal_example():
+    """Basic example extracting structured information from a document."""
+    print("\n--- Multi-Modal Example ---")
+
+    class PersonInfo(BaseModel):
+        name: str
+        age: int
+        occupation: str
+
+    with tempfile.NamedTemporaryFile() as person_file:
+        person_file.write(b"John Smith is a 30-year old software engineer")
+        person_file.flush()
+
+        with open(person_file.name, "rb") as fp:
+            document_bytes = fp.read()
+
+    agent = Agent()
+    result = agent.structured_output(
+        PersonInfo,
+        [
+            {"text": "Please process this application."},
+            {
+                "document": {
+                    "format": "txt",
+                    "name": "application",
+                    "source": {
+                        "bytes": document_bytes,
+                    },
+                },
+            },
+        ]
+    )
+
+    print(f"Name: {result.name}")  # "John Smith"
+    print(f"Age: {result.age}")  # 30
     print(f"Job: {result.occupation}")  # "software engineer"
 
 
 def conversation_history_example():
     """Example using conversation history with structured output."""
     print("\n--- Conversation History Example ---")
-
+
     agent = Agent()
 
     # Build up conversation context
@@ -51,7 +90,7 @@ class CityInfo(BaseModel):
     # Uses existing conversation context with a prompt
     print("Extracting structured information from conversation context...")
     result = agent.structured_output(CityInfo, "Extract structured information about Paris")
-
+
     print(f"City: {result.city}")
     print(f"Country: {result.country}")
     print(f"Population: {result.population}")
@@ -61,7 +100,7 @@ class CityInfo(BaseModel):
 def complex_nested_model_example():
     """Example handling complex nested data structures."""
     print("\n--- Complex Nested Model Example ---")
-
+
     class Address(BaseModel):
         street: str
         city: str
@@ -97,9 +136,10 @@ class Person(BaseModel):
 
 if __name__ == "__main__":
     print("Structured Output Examples\n")
-
+
     basic_example()
+    multimodal_example()
     conversation_history_example()
     complex_nested_model_example()
-
-    print("\nExamples completed.")
+
+    print("\nExamples completed.")
```

docs/user-guide/concepts/agents/prompts.md

Lines changed: 28 additions & 5 deletions

````diff
@@ -24,23 +24,46 @@ If you do not specify a system prompt, the model will behave according to its de
 
 These are your queries or requests to the agent. The SDK supports multiple techniques for prompting.
 
-### Direct Prompting
+### Text Prompt
 
-The simplest way to interact with an agent is through direct prompting:
+The simplest way to interact with an agent is through a text prompt:
 
 ```python
 response = agent("What is the time in Seattle")
 ```
 
+### Multi-Modal Prompting
+The SDK also supports multi-modal prompts, allowing you to include images, documents, and other content types in your messages:
+
+```python
+with open("path/to/image.png", "rb") as fp:
+    image_bytes = fp.read()
+
+response = agent([
+    {"text": "What can you see in this image?"},
+    {
+        "image": {
+            "format": "png",
+            "source": {
+                "bytes": image_bytes,
+            },
+        },
+    },
+])
+```
+
+For a complete list of supported content types, please refer to the [API Reference](../../../api-reference/types.md#strands.types.content.ContentBlock).
+
+
 ### Direct Tool Calls
 
-For programmatic control, you can call tools directly:
+Prompting is a primary functionality of Strands that allows you to invoke tools through natural language requests. However, if at any point you require more programmatic control, Strands also allows you to invoke tools directly:
 
 ```python
 result = agent.tool.current_time(timezone="US/Pacific")
 ```
 
-This bypasses the natural language interface and directly executes the tool with the specified parameters. By default, direct tool calls are added to the [session state](state-sessions.md) but can be optionally not included by specifying `record_direct_tool_call=False`.
+This bypasses the natural language interface and directly executes the tool with the specified parameters. By default, direct tool calls are added to the [session state](state-sessions.md) but can be optionally excluded by specifying `record_direct_tool_call=False`.
 
 ## Prompt Engineering
 
@@ -52,4 +75,4 @@ Further resources:
 * [Amazon Bedrock - Prompt engineering concepts](https://docs.aws.amazon.com/bedrock/latest/userguide/prompt-engineering-guidelines.html)
 * [Llama - Prompting](https://www.llama.com/docs/how-to-guides/prompting/)
 * [Anthropic - Prompt engineering overview](https://docs.anthropic.com/en/docs/build-with-claude/prompt-engineering/overview)
-* [OpenAI - Prompt engineering](https://platform.openai.com/docs/guides/prompt-engineering/six-strategies-for-getting-better-results)
+* [OpenAI - Prompt engineering](https://platform.openai.com/docs/guides/prompt-engineering/six-strategies-for-getting-better-results)
````
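The revised paragraph mentions `record_direct_tool_call=False` without showing where the flag is passed. A minimal sketch, assuming the flag is accepted by the `Agent` constructor and that a `current_time` tool is available (both are assumptions, not confirmed by this diff):

```python
from strands import Agent
from strands_tools import current_time  # assumed tool module; adjust to your setup

# Keep direct tool calls out of the recorded session state.
agent = Agent(tools=[current_time], record_direct_tool_call=False)

# Executes the tool directly; with the flag above, the call is not
# added to the agent's session state.
result = agent.tool.current_time(timezone="US/Pacific")
```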

docs/user-guide/concepts/agents/structured-output.md

Lines changed: 36 additions & 3 deletions

````diff
@@ -14,7 +14,7 @@ flowchart LR
         direction TB
         C[convert_pydantic_to_tool_spec] --> D[LLM Response]
     end
-
+
     B --> Process
     Process --> E[Validated Pydantic Model]
 ```
@@ -69,7 +69,7 @@ class PersonInfo(BaseModel):
 
 agent = Agent()
 result = agent.structured_output(
-    PersonInfo,
+    PersonInfo,
     "John Smith is a 30-year-old software engineer"
 )
 
@@ -78,6 +78,40 @@ print(f"Age: {result.age}")  # 30
 print(f"Job: {result.occupation}")  # "software engineer"
 ```
 
+### Multi-Modal Input
+
+Extract structured information from prompts containing images, documents, and other content types:
+
+```python
+class PersonInfo(BaseModel):
+    name: str
+    age: int
+    occupation: str
+
+with open("path/to/document.pdf", "rb") as fp:
+    document_bytes = fp.read()
+
+agent = Agent()
+result = agent.structured_output(
+    PersonInfo,
+    [
+        {"text": "Please process this application."},
+        {
+            "document": {
+                "format": "pdf",
+                "name": "application",
+                "source": {
+                    "bytes": document_bytes,
+                },
+            },
+        },
+    ]
+)
+```
+
+For a complete list of supported content types, please refer to the [API Reference](../../../api-reference/types.md#strands.types.content.ContentBlock).
+
+
 ### Using Conversation History
 
 Structured output can work with existing conversation context:
@@ -159,7 +193,6 @@ except ValidationError as e:
     # 3. Extract partial information from the error
 ```
 
-
 ## Best Practices
 
 - **Keep models focused**: Define specific models for clear purposes
````