
Commit b94d0e7

Merge branch 'main' into feature/existing-iam-role-template
2 parents 6639c31 + 37374e7

4 files changed: +53, -8 lines

docker-compose.yml

Lines changed: 18 additions & 0 deletions
@@ -0,0 +1,18 @@
+version: '3.8'
+
+services:
+  bedrock-access-gateway:
+    build:
+      context: ./src
+      dockerfile: Dockerfile_ecs
+    ports:
+      - "127.0.0.1:8000:8080"
+    environment:
+      - ENABLE_PROMPT_CACHING=true
+      - API_KEY=${OPENAI_API_KEY}
+      - AWS_PROFILE
+      - AWS_ACCESS_KEY_ID
+      - AWS_SECRET_ACCESS_KEY
+      - AWS_SESSION_TOKEN
+    volumes:
+      - ${HOME}/.aws:/home/appuser/.aws
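
A quick way to smoke-test the composed service is to point the OpenAI SDK at the published port. The sketch below is illustrative, not part of the commit: it assumes the gateway exposes its OpenAI-compatible routes under /api/v1 (the project default) and that the key matches the API_KEY the container was started with; the model ID is a placeholder for whichever Bedrock model is enabled in your account.

# Illustrative smoke test against the gateway published on 127.0.0.1:8000.
from openai import OpenAI

client = OpenAI(
    base_url="http://127.0.0.1:8000/api/v1",  # assumed default route prefix
    api_key="my-gateway-key",                 # must match API_KEY above
)

response = client.chat.completions.create(
    model="anthropic.claude-3-sonnet-20240229-v1:0",  # placeholder model ID
    messages=[{"role": "user", "content": "Say hello."}],
)
print(response.choices[0].message.content)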

scripts/push-to-ecr.sh

Lines changed: 3 additions & 0 deletions
@@ -7,6 +7,9 @@ set -o errexit # exit on first error
 set -o nounset # exit on using unset variables
 set -o pipefail # exit on any error in a pipeline
 
+# Change to the directory where the script is located
+cd "$(dirname "$0")"
+
 # Prompt user for inputs
 echo "================================================"
 echo "Bedrock Access Gateway - Build and Push to ECR"

src/Dockerfile_ecs

Lines changed: 1 addition & 1 deletion
@@ -21,6 +21,6 @@ RUN python3 -c 'import tiktoken_ext.openai_public as tke; tke.cl100k_base()'
 ENV PORT=8080
 
 HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
-    CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8080/health').read()"
+    CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:${PORT}/health').read()"
 
 CMD ["sh", "-c", "uvicorn api.app:app --host 0.0.0.0 --port ${PORT}"]

src/api/models/bedrock.py

Lines changed: 31 additions & 7 deletions
@@ -964,11 +964,13 @@ def _create_response_stream(
         finish_reason = None
         message = None
         usage = None
+
         if "messageStart" in chunk:
             message = ChatResponseMessage(
                 role=chunk["messageStart"]["role"],
                 content="",
             )
+
         if "contentBlockStart" in chunk:
             # tool call start
             delta = chunk["contentBlockStart"]["start"]
@@ -988,25 +990,30 @@ def _create_response_stream(
                     )
                 ]
             )
+
         if "contentBlockDelta" in chunk:
             delta = chunk["contentBlockDelta"]["delta"]
             if "text" in delta:
-                # stream content
-                message = ChatResponseMessage(
-                    content=delta["text"],
-                )
+                # Regular text content - close thinking tag if open
+                content = delta["text"]
+                if self.think_emitted:
+                    # Transition from reasoning to regular text
+                    content = "</think>" + content
+                    self.think_emitted = False
+                message = ChatResponseMessage(content=content)
             elif "reasoningContent" in delta:
                 if "text" in delta["reasoningContent"]:
                     content = delta["reasoningContent"]["text"]
                     if not self.think_emitted:
-                        # Port of "content_block_start" with "thinking"
+                        # Start of reasoning content
                         content = "<think>" + content
                         self.think_emitted = True
                     message = ChatResponseMessage(content=content)
                 elif "signature" in delta["reasoningContent"]:
-                    # Port of "signature_delta"
+                    # Port of "signature_delta" (for models that send it)
                     if self.think_emitted:
-                        message = ChatResponseMessage(content="\n </think> \n\n")
+                        message = ChatResponseMessage(content="</think>")
+                        self.think_emitted = False
                     else:
                         return None # Ignore signature if no <think> started
             else:
@@ -1022,7 +1029,23 @@ def _create_response_stream(
                     )
                 ]
             )
+
         if "messageStop" in chunk:
+            # Safety check: Close any open thinking tags before message stops
+            if self.think_emitted:
+                self.think_emitted = False
+                return ChatStreamResponse(
+                    id=message_id,
+                    model=model_id,
+                    choices=[
+                        ChoiceDelta(
+                            index=0,
+                            delta=ChatResponseMessage(content="</think>"),
+                            logprobs=None,
+                            finish_reason=None,
+                        )
+                    ],
+                )
             message = ChatResponseMessage()
             finish_reason = chunk["messageStop"]["stopReason"]

@@ -1063,6 +1086,7 @@ def _create_response_stream(
                     prompt_tokens_details=prompt_tokens_details,
                 ),
             )
+
         if message:
             return ChatStreamResponse(
                 id=message_id,
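
Taken together, the bedrock.py changes form a small state machine around self.think_emitted: the first reasoningContent delta opens a <think> tag, and the tag is closed by the first regular text delta, by a signature delta, or, as a safety net, at messageStop. Distilled out of the streaming handler (the names below are illustrative, not the module's actual API), the transitions look roughly like this:

# Illustrative distillation of the think-tag handling above.
from typing import Optional


class ThinkTagTracker:
    def __init__(self) -> None:
        self.think_emitted = False  # is a <think> tag currently open?

    def on_reasoning_text(self, text: str) -> str:
        # The first reasoning delta opens the tag.
        if not self.think_emitted:
            self.think_emitted = True
            return "<think>" + text
        return text

    def on_regular_text(self, text: str) -> str:
        # Regular content closes a still-open tag before resuming.
        if self.think_emitted:
            self.think_emitted = False
            return "</think>" + text
        return text

    def on_signature(self) -> Optional[str]:
        # A signature delta also closes the tag; it is ignored otherwise.
        if self.think_emitted:
            self.think_emitted = False
            return "</think>"
        return None

    def on_message_stop(self) -> Optional[str]:
        # Safety net: never leave a dangling <think> when the message ends.
        return self.on_signature()


tracker = ThinkTagTracker()
print(tracker.on_reasoning_text("step 1"))  # "<think>step 1"
print(tracker.on_regular_text("Answer."))   # "</think>Answer."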
