From 099cc13eddd02582a5de9b433f18cdbc7c316d51 Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto
Date: Fri, 7 Nov 2025 21:31:45 +0100
Subject: [PATCH 1/8] feat: respect context

Signed-off-by: Ettore Di Giacinto
---
 backend/cpp/llama-cpp/grpc-server.cpp | 70 +++++++++++++++++++++++----
 core/http/endpoints/openai/chat.go    |  5 ++
 core/http/middleware/request.go       |  3 +-
 pkg/grpc/client.go                    | 11 +++++
 4 files changed, 79 insertions(+), 10 deletions(-)

diff --git a/backend/cpp/llama-cpp/grpc-server.cpp b/backend/cpp/llama-cpp/grpc-server.cpp
index a33dc5c20da3..790032d60316 100644
--- a/backend/cpp/llama-cpp/grpc-server.cpp
+++ b/backend/cpp/llama-cpp/grpc-server.cpp
@@ -822,6 +822,12 @@ class BackendServiceImpl final : public backend::Backend::Service {
         }
 
         ctx_server.receive_cmpl_results_stream(task_ids, [&](server_task_result_ptr & result) -> bool {
+            // Check if context is cancelled before processing result
+            if (context->IsCancelled()) {
+                ctx_server.cancel_tasks(task_ids);
+                return false;
+            }
+
             json res_json = result->to_json();
             if (res_json.is_array()) {
                 for (const auto & res : res_json) {
@@ -875,13 +881,18 @@ class BackendServiceImpl final : public backend::Backend::Service {
             reply.set_message(error_data.value("content", ""));
             writer->Write(reply);
             return true;
-        }, [&]() {
-            // NOTE: we should try to check when the writer is closed here
-            return false;
+        }, [&context]() {
+            // Check if the gRPC context is cancelled
+            return context->IsCancelled();
         });
 
         ctx_server.queue_results.remove_waiting_task_ids(task_ids);
 
+        // Check if context was cancelled during processing
+        if (context->IsCancelled()) {
+            return grpc::Status(grpc::StatusCode::CANCELLED, "Request cancelled by client");
+        }
+
         return grpc::Status::OK;
     }
 
@@ -1145,6 +1156,14 @@ class BackendServiceImpl final : public backend::Backend::Service {
 
         std::cout << "[DEBUG] Waiting for results..." << std::endl;
+
+        // Check cancellation before waiting for results
+        if (context->IsCancelled()) {
+            ctx_server.cancel_tasks(task_ids);
+            ctx_server.queue_results.remove_waiting_task_ids(task_ids);
+            return grpc::Status(grpc::StatusCode::CANCELLED, "Request cancelled by client");
+        }
+
         ctx_server.receive_multi_results(task_ids, [&](std::vector<server_task_result_ptr> & results) {
             std::cout << "[DEBUG] Received " << results.size() << " results" << std::endl;
             if (results.size() == 1) {
@@ -1176,13 +1195,20 @@ class BackendServiceImpl final : public backend::Backend::Service {
         }, [&](const json & error_data) {
             std::cout << "[DEBUG] Error in results: " << error_data.value("content", "") << std::endl;
             reply->set_message(error_data.value("content", ""));
-        }, [&]() {
-            return false;
+        }, [&context]() {
+            // Check if the gRPC context is cancelled
+            // This is checked every HTTP_POLLING_SECONDS (1 second) during receive_multi_results
+            return context->IsCancelled();
         });
 
         ctx_server.queue_results.remove_waiting_task_ids(task_ids);
         std::cout << "[DEBUG] Predict request completed successfully" << std::endl;
 
+        // Check if context was cancelled during processing
+        if (context->IsCancelled()) {
+            return grpc::Status(grpc::StatusCode::CANCELLED, "Request cancelled by client");
+        }
+
         return grpc::Status::OK;
     }
 
@@ -1234,6 +1260,13 @@ class BackendServiceImpl final : public backend::Backend::Service {
             ctx_server.queue_tasks.post(std::move(tasks));
         }
 
+        // Check cancellation before waiting for results
+        if (context->IsCancelled()) {
+            ctx_server.cancel_tasks(task_ids);
+            ctx_server.queue_results.remove_waiting_task_ids(task_ids);
+            return grpc::Status(grpc::StatusCode::CANCELLED, "Request cancelled by client");
+        }
+
         // get the result
         ctx_server.receive_multi_results(task_ids, [&](std::vector<server_task_result_ptr> & results) {
             for (auto & res : results) {
@@ -1242,12 +1275,18 @@ class BackendServiceImpl final : public backend::Backend::Service {
             }
         }, [&](const json & error_data) {
             error = true;
-        }, [&]() {
-            return false;
+        }, [&context]() {
+            // Check if the gRPC context is cancelled
+            return context->IsCancelled();
         });
 
         ctx_server.queue_results.remove_waiting_task_ids(task_ids);
 
+        // Check if context was cancelled during processing
+        if (context->IsCancelled()) {
+            return grpc::Status(grpc::StatusCode::CANCELLED, "Request cancelled by client");
+        }
+
         if (error) {
             return grpc::Status(grpc::StatusCode::INTERNAL, "Error in receiving results");
         }
@@ -1325,6 +1364,13 @@ class BackendServiceImpl final : public backend::Backend::Service {
             ctx_server.queue_tasks.post(std::move(tasks));
         }
 
+        // Check cancellation before waiting for results
+        if (context->IsCancelled()) {
+            ctx_server.cancel_tasks(task_ids);
+            ctx_server.queue_results.remove_waiting_task_ids(task_ids);
+            return grpc::Status(grpc::StatusCode::CANCELLED, "Request cancelled by client");
+        }
+
         // Get the results
         ctx_server.receive_multi_results(task_ids, [&](std::vector<server_task_result_ptr> & results) {
             for (auto & res : results) {
@@ -1333,12 +1379,18 @@ class BackendServiceImpl final : public backend::Backend::Service {
             }
         }, [&](const json & error_data) {
             error = true;
-        }, [&]() {
-            return false;
+        }, [&context]() {
+            // Check if the gRPC context is cancelled
+            return context->IsCancelled();
        });
 
         ctx_server.queue_results.remove_waiting_task_ids(task_ids);
 
+        // Check if context was cancelled during processing
+        if (context->IsCancelled()) {
+            return grpc::Status(grpc::StatusCode::CANCELLED, "Request cancelled by client");
+        }
+
         if (error) {
             return grpc::Status(grpc::StatusCode::INTERNAL, "Error in receiving results");
         }
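
All the C++ changes above follow one pattern: every blocking wait on llama.cpp's result queue gets a cancellation predicate wired to grpc::ServerContext::IsCancelled(), and receive_multi_results re-invokes that predicate roughly once per second (HTTP_POLLING_SECONDS), so a dropped client is noticed within about one polling interval and the queued tasks are cancelled instead of running to completion. The following is a self-contained Go sketch of that polling shape, not part of the patch; waitForResults and the surrounding names are illustrative, not LocalAI APIs:

package main

import (
	"context"
	"fmt"
	"time"
)

// waitForResults blocks until work completes or isCancelled reports true,
// re-checking once per second, the same shape as the C++ lambda that
// returns context->IsCancelled() from receive_multi_results.
func waitForResults(done <-chan string, isCancelled func() bool) (string, error) {
	ticker := time.NewTicker(time.Second)
	defer ticker.Stop()
	for {
		select {
		case res := <-done:
			return res, nil
		case <-ticker.C:
			if isCancelled() {
				return "", context.Canceled
			}
		}
	}
}

func main() {
	ctx, cancel := context.WithCancel(context.Background())
	done := make(chan string)

	go func() { // simulated long-running inference
		time.Sleep(10 * time.Second)
		done <- "result"
	}()
	go func() { // simulated client disconnect after two seconds
		time.Sleep(2 * time.Second)
		cancel()
	}()

	// ctx.Err() != nil plays the role of grpc::ServerContext::IsCancelled().
	if _, err := waitForResults(done, func() bool { return ctx.Err() != nil }); err != nil {
		fmt.Println("request cancelled:", err) // prints after roughly 2-3 seconds
	}
}

Polling trades a small detection latency for not having to hook a transport-level close event, which the llama.cpp queue API does not expose.
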
diff --git a/core/http/endpoints/openai/chat.go b/core/http/endpoints/openai/chat.go
index d1ce156215c4..17d2527e070d 100644
--- a/core/http/endpoints/openai/chat.go
+++ b/core/http/endpoints/openai/chat.go
@@ -358,6 +358,11 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
 	LOOP:
 		for {
 			select {
+			case <-input.Context.Done():
+				// Context was cancelled (client disconnected or request cancelled)
+				log.Debug().Msgf("Request context cancelled, stopping stream")
+				input.Cancel()
+				break LOOP
 			case ev := <-responses:
 				if len(ev.Choices) == 0 {
 					log.Debug().Msgf("No choices in the response, skipping")
diff --git a/core/http/middleware/request.go b/core/http/middleware/request.go
index 35f39f7f37f9..98e89fc73586 100644
--- a/core/http/middleware/request.go
+++ b/core/http/middleware/request.go
@@ -161,7 +161,8 @@ func (re *RequestExtractor) SetOpenAIRequest(ctx *fiber.Ctx) error {
 	correlationID := ctx.Get("X-Correlation-ID", uuid.New().String())
 	ctx.Set("X-Correlation-ID", correlationID)
 
-	c1, cancel := context.WithCancel(re.applicationConfig.Context)
+	//c1, cancel := context.WithCancel(re.applicationConfig.Context)
+	c1, cancel := context.WithCancel(ctx.Context())
 
 	// Add the correlation ID to the new context
 	ctxWithCorrelationID := context.WithValue(c1, CorrelationIDKey, correlationID)
diff --git a/pkg/grpc/client.go b/pkg/grpc/client.go
index f0f9a930efeb..ff5dccb41232 100644
--- a/pkg/grpc/client.go
+++ b/pkg/grpc/client.go
@@ -178,11 +178,22 @@ func (c *Client) PredictStream(ctx context.Context, in *pb.PredictOptions, f fun
 	}
 
 	for {
+		// Check if context is cancelled before receiving
+		select {
+		case <-ctx.Done():
+			return ctx.Err()
+		default:
+		}
+
 		reply, err := stream.Recv()
 		if err == io.EOF {
 			break
 		}
 		if err != nil {
+			// Check if error is due to context cancellation
+			if ctx.Err() != nil {
+				return ctx.Err()
+			}
 			fmt.Println("Error", err)
 			return err

From 38125d9646d1113c6077ce288eb8821195cea69a Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto
Date: Sat, 8 Nov 2025 18:27:16 +0100
Subject: [PATCH 2/8] workaround fasthttp

Signed-off-by: Ettore Di Giacinto
---
 core/http/endpoints/openai/chat.go | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/core/http/endpoints/openai/chat.go b/core/http/endpoints/openai/chat.go
index 17d2527e070d..977f655b568e 100644
--- a/core/http/endpoints/openai/chat.go
+++ b/core/http/endpoints/openai/chat.go
@@ -5,6 +5,7 @@ import (
 	"bytes"
 	"encoding/json"
 	"fmt"
+	"net"
 	"time"
 
 	"github.com/gofiber/fiber/v2"
@@ -516,6 +517,29 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
 
 		}
 
+		// NOTE: this is a bad WORKAROUND! We should find a better way to handle this.
+		// Fasthttp doesn't support context cancellation from the caller
+		// for non-streaming requests, so we need to monitor the connection directly.
+		// Monitor connection for client disconnection during non-streaming requests
+		// We access the connection directly via c.Context().Conn() to monitor it
+		// during ComputeChoices execution, not after the response is sent
+		// see: https://github.com/mudler/LocalAI/pull/7187#issuecomment-3506720906
+		var conn net.Conn = c.Context().Conn()
+		if conn != nil {
+			go func() {
+				buf := make([]byte, 1)
+				for {
+					_, err := conn.Read(buf)
+					if err != nil {
+						// Connection closed - cancel the context to stop gRPC call
+						log.Debug().Msgf("Cancelling GRPC call")
+						input.Cancel()
+						return
+					}
+				}
+			}()
+		}
+
 		result, tokenUsage, err := ComputeChoices(
 			input,
 			predInput,
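
For non-streaming requests there is no write loop that would surface a broken pipe, and fasthttp does not expose the request's cancellation to the handler, so the workaround above watches the raw socket instead: a goroutine parks in a 1-byte Read that only returns once the peer closes the connection (or, as the NOTE admits, sends more data such as a pipelined request). A self-contained sketch of the same trick, separate from the patch, using net.Pipe as a stand-in for the fasthttp connection:

package main

import (
	"context"
	"fmt"
	"net"
	"time"
)

// monitorConn mirrors the goroutine added above: park in a 1-byte Read and
// treat a read error as "client went away".
func monitorConn(conn net.Conn, cancel context.CancelFunc) {
	go func() {
		buf := make([]byte, 1)
		for {
			if _, err := conn.Read(buf); err != nil {
				// Connection closed: abort the in-flight backend call.
				cancel()
				return
			}
		}
	}()
}

func main() {
	serverSide, clientSide := net.Pipe() // stand-in for the fasthttp connection
	ctx, cancel := context.WithCancel(context.Background())
	monitorConn(serverSide, cancel)

	go func() { // simulated client disconnect mid-request
		time.Sleep(time.Second)
		clientSide.Close()
	}()

	select {
	case <-ctx.Done():
		fmt.Println("client disconnected, cancelling") // after about 1 second
	case <-time.After(5 * time.Second):
		fmt.Println("request completed normally")
	}
}
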
From 38b4a2af2b69f841aa0143d8fcefd814f5d97d7d Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto
Date: Sat, 8 Nov 2025 18:53:40 +0100
Subject: [PATCH 3/8] feat(ui): allow to abort call

Signed-off-by: Ettore Di Giacinto
---
 core/http/static/chat.js  | 84 +++++++++++++++++++++++++++++++--------
 core/http/views/chat.html | 20 ++++++----
 2 files changed, 79 insertions(+), 25 deletions(-)

diff --git a/core/http/static/chat.js b/core/http/static/chat.js
index 9b10a626e967..4517b0baf5bf 100644
--- a/core/http/static/chat.js
+++ b/core/http/static/chat.js
@@ -27,21 +27,43 @@ SOFTWARE.
 
 */
 
+// Global variable to store the current AbortController
+let currentAbortController = null;
+let currentReader = null;
+
 function toggleLoader(show) {
-  const loader = document.getElementById('loader');
   const sendButton = document.getElementById('send-button');
+  const stopButton = document.getElementById('stop-button');
 
   if (show) {
-    loader.style.display = 'block';
     sendButton.style.display = 'none';
+    stopButton.style.display = 'block';
     document.getElementById("input").disabled = true;
   } else {
     document.getElementById("input").disabled = false;
-    loader.style.display = 'none';
     sendButton.style.display = 'block';
+    stopButton.style.display = 'none';
+    currentAbortController = null;
+    currentReader = null;
   }
 }
 
+function stopRequest() {
+  if (currentAbortController) {
+    currentAbortController.abort();
+    currentAbortController = null;
+  }
+  if (currentReader) {
+    currentReader.cancel();
+    currentReader = null;
+  }
+  toggleLoader(false);
+  Alpine.store("chat").add(
+    "assistant",
+    `Request cancelled by user`,
+  );
+}
+
 function processThinkingTags(content) {
   const thinkingRegex = /<thinking>(.*?)<\/thinking>|<think>(.*?)<\/think>/gs;
   const parts = content.split(thinkingRegex);
@@ -295,8 +317,9 @@ async function promptGPT(systemPrompt, input) {
   let response;
 
   try {
-    // Create AbortController for timeout handling
+    // Create AbortController for timeout handling and stop button
    const controller = new AbortController();
+    currentAbortController = controller; // Store globally so stop button can abort it
     const timeoutId = setTimeout(() => controller.abort(), mcpMode ? 300000 : 30000); // 5 minutes for MCP, 30 seconds for regular
 
     response = await fetch(endpoint, {
@@ -311,11 +334,20 @@ async function promptGPT(systemPrompt, input) {
 
     clearTimeout(timeoutId);
   } catch (error) {
+    // Don't show error if request was aborted by user (stop button)
     if (error.name === 'AbortError') {
-      Alpine.store("chat").add(
-        "assistant",
-        `Request timeout: MCP processing is taking longer than expected. Please try again.`,
-      );
+      // Check if this was a user-initiated abort (stop button was clicked)
+      // If currentAbortController is null, it means stopRequest() was called and already handled the UI
+      if (!currentAbortController) {
+        // User clicked stop button - error message already shown by stopRequest()
+        return;
+      } else {
+        // Timeout error (controller was aborted by timeout, not user)
+        Alpine.store("chat").add(
+          "assistant",
+          `Request timeout: MCP processing is taking longer than expected. Please try again.`,
+        );
+      }
     } else {
       Alpine.store("chat").add(
         "assistant",
       );
     }
     toggleLoader(false);
+    currentAbortController = null;
     return;
   }
@@ -332,6 +365,7 @@ async function promptGPT(systemPrompt, input) {
       `Error: POST ${endpoint} ${response.status}`,
     );
     toggleLoader(false);
+    currentAbortController = null;
     return;
   }
@@ -360,10 +394,15 @@ async function promptGPT(systemPrompt, input) {
 
       // Highlight all code blocks
       hljs.highlightAll();
     } catch (error) {
-      Alpine.store("chat").add(
-        "assistant",
-        `Error: Failed to parse MCP response`,
-      );
+      // Don't show error if request was aborted by user
+      if (error.name !== 'AbortError' || currentAbortController) {
+        Alpine.store("chat").add(
+          "assistant",
+          `Error: Failed to parse MCP response`,
+        );
+      }
+    } finally {
+      currentAbortController = null;
     }
   } else {
     // Handle regular streaming response
@@ -376,9 +415,13 @@ async function promptGPT(systemPrompt, input) {
         "assistant",
         `Error: Failed to decode API response`,
       );
+      toggleLoader(false);
       return;
     }
 
+    // Store reader globally so stop button can cancel it
+    currentReader = reader;
+
     // Function to add content to the chat and handle DOM updates efficiently
     const addToChat = (token) => {
       const chatStore = Alpine.store("chat");
@@ -479,13 +522,20 @@ async function promptGPT(systemPrompt, input) {
 
       // Highlight all code blocks once at the end
       hljs.highlightAll();
     } catch (error) {
-      Alpine.store("chat").add(
-        "assistant",
-        `Error: Failed to process stream`,
-      );
+      // Don't show error if request was aborted by user
+      if (error.name !== 'AbortError' || !currentAbortController) {
+        Alpine.store("chat").add(
+          "assistant",
+          `Error: Failed to process stream`,
+        );
+      }
     } finally {
       // Perform any cleanup if necessary
-      reader.releaseLock();
+      if (reader) {
+        reader.releaseLock();
+      }
+      currentReader = null;
+      currentAbortController = null;
     }
   }

diff --git a/core/http/views/chat.html b/core/http/views/chat.html
index ff9ed3ee6089..86338402f330 100644
--- a/core/http/views/chat.html
+++ b/core/http/views/chat.html
@@ -402,15 +402,19 @@
 [hunk body lost in extraction; the surviving fragments are a context line with
 title="Upload text, markdown or PDF file" and bare -/+ markers from the swapped
 button markup. Per the chat.js changes above, the hunk removes the loader
 element and pairs the send button (id="send-button") with a stop button
 (id="stop-button", onclick="stopRequest()") shown while a request is in flight.]
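
End to end, the intended abort path across the three patches is: the UI's stop button calls stopRequest(), which aborts the fetch and cancels the stream reader; the server side observes the dropped connection (directly for non-streaming requests, via the per-request context from core/http/middleware/request.go otherwise); PredictStream returns ctx.Err(); and the C++ backend's IsCancelled() checks cancel the llama.cpp tasks. A self-contained sketch of the client-side guard, mirroring the pkg/grpc/client.go change; recv here is an illustrative stand-in for stream.Recv(), not a real API:

package main

import (
	"context"
	"fmt"
	"time"
)

// recvLoop mirrors the PredictStream change: check the context before each
// blocking receive, and report ctx.Err() instead of the transport error once
// the context has been cancelled.
func recvLoop(ctx context.Context, recv func() (string, error), onToken func(string)) error {
	for {
		select {
		case <-ctx.Done():
			return ctx.Err()
		default:
		}
		tok, err := recv()
		if err != nil {
			if ctx.Err() != nil {
				return ctx.Err() // cancellation, not a transport failure
			}
			return err
		}
		onToken(tok)
	}
}

func main() {
	ctx, cancel := context.WithCancel(context.Background())
	go func() { // plays the role of the UI's stop button
		time.Sleep(2 * time.Second)
		cancel()
	}()

	recv := func() (string, error) { // stand-in for stream.Recv()
		time.Sleep(500 * time.Millisecond)
		return "token ", nil
	}

	err := recvLoop(ctx, recv, func(t string) { fmt.Print(t) })
	fmt.Println("\nstopped:", err) // context.Canceled after about 2 seconds
}

Reporting ctx.Err() rather than the raw stream error matters because gRPC surfaces a cancelled stream as a transport error; checking the context first lets callers distinguish a deliberate stop from a real failure.
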