From d79b3c5be325a3dee3af325c97f2a2ae3c1c89e5 Mon Sep 17 00:00:00 2001 From: Lei Date: Fri, 12 Sep 2025 14:23:44 +0800 Subject: [PATCH 01/34] add: stream output --- contrib/copilot-plugin/src/app/ChatBox.tsx | 77 ++++++++-- .../copilot-plugin/src/app/ChatHistory.tsx | 2 +- contrib/copilot-plugin/src/libs/state.ts | 24 +++- .../src/copilot_agent/copilot_conversation.py | 44 +++--- .../src/copilot_agent/copilot_service.py | 65 +++++++++ .../src/copilot_agent/copilot_turn.py | 4 +- src/copilot-chat/src/copilot_agent/ltp/ltp.py | 5 +- .../src/copilot_agent/utils/__init__.py | 4 + .../src/copilot_agent/utils/llmsession.py | 131 +++++++++++++++++- .../src/copilot_agent/utils/push_frontend.py | 27 ++++ .../src/copilot_agent/utils/smart_help.py | 4 +- .../src/copilot_agent/utils/summary.py | 3 +- 12 files changed, 353 insertions(+), 37 deletions(-) create mode 100644 src/copilot-chat/src/copilot_agent/utils/push_frontend.py diff --git a/contrib/copilot-plugin/src/app/ChatBox.tsx b/contrib/copilot-plugin/src/app/ChatBox.tsx index 949edddc..1b4472c1 100644 --- a/contrib/copilot-plugin/src/app/ChatBox.tsx +++ b/contrib/copilot-plugin/src/app/ChatBox.tsx @@ -21,8 +21,8 @@ export default function ChatBox() { // Use local backend when running the dev server (npm start), // and use the relative path for production builds (npm run build). const REMOTE_SERVER_URL = process.env.NODE_ENV === 'development' - ? 'http://127.0.0.1:60000/copilot/api/operation' - : '/copilot/api/operation'; + ? 'http://127.0.0.1:60000/copilot/api/operation/stream' + : '/copilot/api/operation/stream'; const makeChatRequest = async (e: React.FormEvent) => { e.preventDefault(); @@ -60,6 +60,8 @@ export default function ChatBox() { } } }; + // Create assistant placeholder and stream response into it + useChatStore.getState().addChat({ role: "assistant", message: "", timestamp: new Date() }); const response = await fetch(REMOTE_SERVER_URL, { method: "POST", headers: { @@ -69,15 +71,70 @@ export default function ChatBox() { body: JSON.stringify(payload), }); if (!response.ok) throw new Error("Remote server error"); - const data = await response.json(); - if (data?.data?.answer !== "skip") { - useChatStore.getState().addChat({ - role: "assistant", - message: data?.data?.answer ?? 
"No answer found", - timestamp: new Date(), - messageInfo: data?.data?.message_info, // Store the message_info from response - }); + + const reader = response.body?.getReader(); + if (!reader) throw new Error('No response body for streaming'); + const decoder = new TextDecoder(); + // Buffer incoming bytes and parse SSE-style messages (separated by '\n\n') + let buffer = ''; + while (true) { + const { value, done: readerDone } = await reader.read(); + if (value) { + buffer += decoder.decode(value, { stream: true }); + } + + // Process all complete SSE messages in buffer + let sepIndex; + while ((sepIndex = buffer.indexOf('\n\n')) !== -1) { + const rawEvent = buffer.slice(0, sepIndex); + buffer = buffer.slice(sepIndex + 2); + + // Extract data: lines and join with newline to preserve original formatting + const lines = rawEvent.split(/\n/); + const dataParts: string[] = []; + let isDoneEvent = false; + for (const line of lines) { + if (line.startsWith('data:')) { + dataParts.push(line.slice(5)); + } else if (line.startsWith('event:')) { + const ev = line.slice(6).trim(); + if (ev === 'done') isDoneEvent = true; + } + } + + if (dataParts.length > 0) { + const dataStr = dataParts.join('\n'); + // If the server sent a JSON 'append' event, append to last assistant message + let handled = false; + const trimmed = dataStr.trim(); + if (trimmed.startsWith('{')) { + try { + const parsed = JSON.parse(trimmed); + if (parsed && parsed.type === 'append' && typeof parsed.text === 'string') { + useChatStore.getState().appendToLastAssistant(parsed.text); + handled = true; + } + } catch (e) { + // not JSON, fall through to full replace + } + } + + if (!handled) { + // Replace the last assistant message with the full reconstructed text + useChatStore.getState().replaceLastAssistant(dataStr); + } + } + + if (isDoneEvent) { + // stream finished + break; + } + } + + if (readerDone) break; } + + // After the streaming loop, do not alter the assembled markdown so newlines are preserved } catch (err) { toast.error("Failed to get response from remote server"); } diff --git a/contrib/copilot-plugin/src/app/ChatHistory.tsx b/contrib/copilot-plugin/src/app/ChatHistory.tsx index 5f460d15..df606311 100644 --- a/contrib/copilot-plugin/src/app/ChatHistory.tsx +++ b/contrib/copilot-plugin/src/app/ChatHistory.tsx @@ -160,7 +160,7 @@ const Message: React.FC<{ message: ChatMessage, expand?: boolean, isAssistant?: messageInfo: { userId: paiuser, convId: currentConversationId, - turnId: uuidv4(), // Use message's turnId or fallback to "0" + turnId: message.messageInfo?.turnId || "0", timestamp: Math.floor(Date.now()), timestampUnit: "ms", type: "feedback", diff --git a/contrib/copilot-plugin/src/libs/state.ts b/contrib/copilot-plugin/src/libs/state.ts index f5a0605e..2a10d501 100644 --- a/contrib/copilot-plugin/src/libs/state.ts +++ b/contrib/copilot-plugin/src/libs/state.ts @@ -55,6 +55,8 @@ interface State { setAllModelsInCurrentJob: (models: string[]) => void; setCurrentModel: (model: string | null) => void; addChat: (chat: ChatMessage) => void; + appendToLastAssistant: (chunk: string) => void; + replaceLastAssistant: (text: string) => void; // Conversation management actions generateNewConversationId: () => void; @@ -98,7 +100,27 @@ export const useChatStore = create((set) => ({ setCurrentModel: (model) => set({ currentModel: model }), addChat: (log) => set((state) => ({ chatMsgs: [...state.chatMsgs, log] })), - + appendToLastAssistant: (chunk: string) => set((state) => { + const msgs = [...state.chatMsgs]; + for (let i = 
msgs.length - 1; i >= 0; i--) { + if (msgs[i].role === 'assistant') { + msgs[i] = { ...msgs[i], message: (msgs[i].message || '') + chunk }; + break; + } + } + return { chatMsgs: msgs }; + },), + replaceLastAssistant: (text: string) => set((state) => { + const msgs = [...state.chatMsgs]; + for (let i = msgs.length - 1; i >= 0; i--) { + if (msgs[i].role === 'assistant') { + msgs[i] = { ...msgs[i], message: text }; + break; + } + } + return { chatMsgs: msgs }; + }), + // Generate a new conversation ID (useful for starting a new conversation) generateNewConversationId: () => set((state) => ({ currentConversationId: uuidv4(), diff --git a/src/copilot-chat/src/copilot_agent/copilot_conversation.py b/src/copilot-chat/src/copilot_agent/copilot_conversation.py index 00ac213a..4cffbde7 100644 --- a/src/copilot-chat/src/copilot_agent/copilot_conversation.py +++ b/src/copilot-chat/src/copilot_agent/copilot_conversation.py @@ -5,7 +5,6 @@ import os from collections import deque -import uuid import pandas as pd from datetime import datetime, timezone from typing import Union @@ -14,6 +13,7 @@ from .utils.logger import logger from .utils.authentication import AuthenticationManager from .utils.kql_executor import KustoExecutor +from .utils.push_frontend import push_frontend_event, push_frontend_meta from .config import AGENT_MODE_LOCAL, print_env_variables from .copilot_turn import CoPilotTurn @@ -95,23 +95,23 @@ def perform_operation(self, in_parameters: InParameters) -> OutParameters: # process if _is_feedback: - user_id, conv_id = self._extract_user_and_conv_id(question_msg_info) - result = self._handle_feedback_only(user_id, conv_id) + user_id, conv_id, turn_id = self._extract_user_and_conv_id(question_msg_info) + result = self._handle_feedback_only(user_id, conv_id, turn_id) elif _is_question: - user_id, conv_id = self._extract_user_and_conv_id(question_msg_info) + user_id, conv_id, turn_id = self._extract_user_and_conv_id(question_msg_info) # Authenticate only for user question if not self.auth_manager.is_authenticated(username): logger.info(f'User {username} not authenticated, attempting to authenticate...') self.auth_manager.set_authenticate_state(username, rest_token) if not self.auth_manager.is_authenticated(username): logger.error(f'User {username} failed authentication twice. 
Aborting operation.') - result = self._handle_authenticate_failure(user_id, conv_id) + result = self._handle_authenticate_failure(user_id, conv_id, turn_id) else: logger.info(f'User {username} authenticated successfully.') - result = self._handle_user_question(user_id, conv_id, user_prompt, skip_summary, debugging, question_msg_info) + result = self._handle_user_question(user_id, conv_id, turn_id, user_prompt, skip_summary, debugging, question_msg_info) else: logger.info(f'User {username} authenticated successfully.') - result = self._handle_user_question(user_id, conv_id, user_prompt, skip_summary, debugging, question_msg_info) + result = self._handle_user_question(user_id, conv_id, turn_id, user_prompt, skip_summary, debugging, question_msg_info) else: result = self._handle_empty_input() @@ -134,10 +134,12 @@ def _extract_user_and_conv_id(self, question_msg_info): if question_msg_info is not None: user_id = question_msg_info.get('userId', 'unknown') conv_id = question_msg_info.get('convId', 'na') + turn_id = question_msg_info.get('turnId', 'na') else: user_id = 'unknown' conv_id = 'na' - return user_id, conv_id + turn_id = 'na' + return user_id, conv_id, turn_id def _is_feedback_only(self, user_feedback, user_prompt): """Return True if only feedback is provided, not a user question.""" @@ -147,10 +149,10 @@ def _is_user_question_only(self, user_feedback, user_prompt): """Return True if only a user question is provided, not feedback.""" return not user_feedback and user_prompt - def _handle_feedback_only(self, user_id, conv_id): + def _handle_feedback_only(self, user_id, conv_id, turn_id): """Handle the case where only feedback is provided.""" logger.info('User feedback provided without a user question. No operation is required.') - resp = self._make_skip_response(user_id, conv_id, 'feedback_ack') + resp = self._make_skip_response(user_id, conv_id, turn_id, 'feedback_ack') out_parameters = OutParameters(resp) return out_parameters @@ -161,14 +163,14 @@ def _handle_empty_input(self): out_parameters = OutParameters(resp) return out_parameters - def _handle_authenticate_failure(self, user_id, conv_id): + def _handle_authenticate_failure(self, user_id, conv_id, turn_id): """Handle authentication failure case.""" logger.info('User authentication failed. 
Aborting operation.') - resp = self._make_skip_response(user_id, conv_id, 'error') + resp = self._make_skip_response(user_id, conv_id, turn_id, 'error') out_parameters = OutParameters(resp) return out_parameters - def _handle_user_question(self, user_id, conv_id, user_prompt, skip_summary, debugging, question_msg_info): + def _handle_user_question(self, user_id, conv_id, turn_id, user_prompt, skip_summary, debugging, question_msg_info): """Handle the case where only a user question is provided.""" if user_id not in self.msg_dict: self.msg_dict[user_id] = deque(maxlen=HISTORY_DEPTH) @@ -178,15 +180,19 @@ def _handle_user_question(self, user_id, conv_id, user_prompt, skip_summary, deb resp = self.copilot.process_turn(self.msg_dict[user_id], skip_summary, debugging) if not isinstance(resp, dict): logger.info('Unexpected response format from copilot.process_turn') - return self.handle_unexpected_copilot_response(user_id, conv_id) + return self.handle_unexpected_copilot_response(user_id, conv_id, turn_id) response_message_info = { 'userId': user_id, 'convId': conv_id, - 'turnId': str(uuid.uuid4()), + 'turnId': turn_id, 'timestamp': int(datetime.now(timezone.utc).timestamp() * 1000), 'type': 'answer', 'timestampUnit': 'ms', } + # try: + # push_frontend_meta(response_message_info) + # except Exception: + # logger.debug('Failed to push early meta event for streaming client') resp['messageInfo'] = response_message_info debug_info = resp.get('debug') msg_add_kusto_resp = debug_info.get('kusto_response', None) if debug_info is not None else None @@ -251,14 +257,14 @@ def _log_message_data(self, inout: str, parameters: Union[InParameters, OutParam # ingest kusto table self.collect_data_to_kusto(log_data) - def handle_unexpected_copilot_response(self, user_id: str, conv_id: str) -> OutParameters: + def handle_unexpected_copilot_response(self, user_id: str, conv_id: str, turn_id: str) -> OutParameters: """Handle unexpected response format from copilot agent and log error.""" error_resp = { 'answer': 'Internal error: unexpected response format from copilot agent.', 'messageInfo': { 'userId': user_id, 'convId': conv_id, - 'turnId': str(uuid.uuid4()), + 'turnId': turn_id, 'timestamp': int(datetime.now(timezone.utc).timestamp() * 1000), 'type': 'error', 'timestampUnit': 'ms', @@ -293,14 +299,14 @@ def collect_data_to_kusto(self, log_data: dict): logger.error(f"Exception during Kusto analytics collection: {e}", exc_info=True) @staticmethod - def _make_skip_response(user_id, conv_id, type_str): + def _make_skip_response(user_id, conv_id, turn_id, type_str): """Create a standard skip/error response struct for feedback or authentication failure.""" return { 'answer': 'skip', 'messageInfo': { 'userId': user_id, 'convId': conv_id, - 'turnId': str(uuid.uuid4()), + 'turnId': turn_id, 'timestamp': int(datetime.now(timezone.utc).timestamp() * 1000), 'type': type_str, 'timestampUnit': 'ms', diff --git a/src/copilot-chat/src/copilot_agent/copilot_service.py b/src/copilot-chat/src/copilot_agent/copilot_service.py index 5eecd58b..9a33ae6a 100644 --- a/src/copilot-chat/src/copilot_agent/copilot_service.py +++ b/src/copilot-chat/src/copilot_agent/copilot_service.py @@ -7,10 +7,14 @@ from flask import Flask, jsonify, request from flask_cors import CORS import threading +from flask import Response, stream_with_context +import json +import queue from .copilot_conversation import CoPilotConversation from .utils.logger import logger +from .utils.llmsession import LLMSession from .config import AGENT_PORT, AGENT_MODE_LOCAL @@ 
-30,6 +34,7 @@ def __init__(self, copilot_conversation: CoPilotConversation): self.app = Flask(__name__) self.app.add_url_rule('/copilot/api/status', view_func=self.status, methods=['GET']) self.app.add_url_rule('/copilot/api/operation', view_func=self.instance_operation, methods=['POST']) + self.app.add_url_rule('/copilot/api/operation/stream', view_func=self.stream_operation, methods=['POST']) # If running in local agent mode, enable CORS to allow local testing from dev frontends. if AGENT_MODE_LOCAL: @@ -59,6 +64,66 @@ def instance_operation(self): logger.info(f"Error handling copilot/api/operation: {e}") return jsonify({"status": "error", "message": str(e)}), 500 + def stream_operation(self): + """POST endpoint to stream operation output as chunked text (SSE-like). + + The endpoint accepts the same JSON payload as the normal operation endpoint. + It sets a module-level callback in the summary module so streaming chunks are + forwarded to the HTTP response. This avoids changing many internal call chains. + """ + logger.info("Received request at /copilot/api/operation/stream") + try: + data = request.get_json() + except Exception as e: + logger.error(f"Failed to parse JSON body for stream_operation: {e}") + return jsonify({"status": "error", "message": "invalid json"}), 400 + + q = queue.Queue() + + def on_chunk(chunk: str): + # put chunk into queue for streaming response + q.put(chunk) + + def worker(): + # set global callback on LLMSession so any call to model.stream_chat will invoke it + original_cb = getattr(LLMSession, '_global_stream_callback', None) + LLMSession.set_global_stream_callback(on_chunk) + try: + in_parameters = self.copilot_conversation.build_in_parameters(data) + self.copilot_conversation.perform_operation(in_parameters) + except Exception as e: + logger.error(f"Error during streaming operation worker: {e}") + q.put(json.dumps({'error': str(e)})) + finally: + # signal end of stream + q.put(None) + # restore original callback + LLMSession.set_global_stream_callback(original_cb) + + def event_stream(): + # start worker thread + t = threading.Thread(target=worker, daemon=True) + t.start() + # yield chunks as they arrive + while True: + item = q.get() + if item is None: + break + # SSE-style framing: prefix each line with 'data:' to support multi-line payloads + text = str(item) + lines = text.splitlines() + if len(lines) == 0: + yield "data: \n\n" + else: + for ln in lines: + yield f"data: {ln}\n" + # event delimiter + yield "\n" + # final event + yield "event: done\n\n" + + return Response(stream_with_context(event_stream()), mimetype='text/event-stream') + def run_http_server(self): """Start the Flask HTTP server.""" port = AGENT_PORT diff --git a/src/copilot-chat/src/copilot_agent/copilot_turn.py b/src/copilot-chat/src/copilot_agent/copilot_turn.py index 2d0d9ada..79ac5405 100644 --- a/src/copilot-chat/src/copilot_agent/copilot_turn.py +++ b/src/copilot-chat/src/copilot_agent/copilot_turn.py @@ -17,6 +17,7 @@ contextualize_question, gen_smart_help, get_prompt_from, + push_frontend_event, ) @@ -44,6 +45,7 @@ def process_turn(self, messages_list: list, skip_summary: bool = False, debuggin return {'category': None, 'answer': 'DEBUGGING MODE ENABLED', 'debug': {'debugging': debugging}} # get contextualized question from this and last user inquiry + push_frontend_event('<...understanding your inquiry...>', replace=True) this_inquiry = messages_list[-1]['content'] last_inquiry = messages_list[-3]['content'] if len(messages_list) > 2 else None question = 
contextualize_question(this_inquiry, last_inquiry) @@ -97,7 +99,7 @@ def gen_smart_help_general(self, question: str) -> str: system_prompt = get_prompt_from(os.path.join(PROMPT_DIR, 'gen_smart_help_prompt_general.txt')) if isinstance(self.help_msg, dict) and 'feature' in self.help_msg: system_prompt = system_prompt + '\n\n' + self.help_msg['feature'] - summary = self.model.chat(system_prompt, f'question is: {question}') + summary = self.model.try_stream_fallback_chat(system_prompt, f'question is: {question}') return summary def get_preload_dashboard(self): diff --git a/src/copilot-chat/src/copilot_agent/ltp/ltp.py b/src/copilot-chat/src/copilot_agent/ltp/ltp.py index 0805d731..bdfbbe66 100644 --- a/src/copilot-chat/src/copilot_agent/ltp/ltp.py +++ b/src/copilot-chat/src/copilot_agent/ltp/ltp.py @@ -16,6 +16,7 @@ from ..utils.openpai import execute_openpai_query from ..utils.powerbi import LTPReportProcessor from ..utils.promql import execute_promql_query, execute_promql_query_step, retrive_promql_response_value +from ..utils.push_frontend import push_frontend_event from ..utils.query import gen_promql_query from ..utils.summary import gen_summary from ..utils.time import get_current_unix_timestamp @@ -168,6 +169,7 @@ def get_brief_job_metadata(resp): def query_powerbi(question: str, help_msg): """Query PowerBI data.""" + push_frontend_event('<...querying data to answer your inquiry...>', replace=True) query_gen_res, query_gen_status = query_generation_kql(question) logger.info(f'KQL Query generation result: {query_gen_res}, status: {query_gen_status}') k_cluster = os.environ.get('DATA_SRC_KUSTO_CLUSTER_URL', '') @@ -202,7 +204,8 @@ def query_powerbi(question: str, help_msg): if response_status == 0: reference = f'\n\n >Reference: the generated KQL query used to get the data:\n\n```\n{query_gen_res}\n```' - summary += reference + # push to frontend + push_frontend_event(reference) info_dict = {} info_dict["s0_query_gen"] = {"res": query_gen_res, "status": query_gen_status} diff --git a/src/copilot-chat/src/copilot_agent/utils/__init__.py b/src/copilot-chat/src/copilot_agent/utils/__init__.py index 9f2b2ea6..c2bb0d88 100644 --- a/src/copilot-chat/src/copilot_agent/utils/__init__.py +++ b/src/copilot-chat/src/copilot_agent/utils/__init__.py @@ -31,6 +31,9 @@ from .promql import ( execute_promql_query, ) +from .push_frontend import( + push_frontend_event, +) from .query import ( gen_kusto_query_fallback_pseudo, gen_kusto_query_pseudo, @@ -94,6 +97,7 @@ 'is_valid_json', 'logger', 'parse_and_align_dcw', + 'push_frontend_event', 'retry_function', 'save_to_csv', ] diff --git a/src/copilot-chat/src/copilot_agent/utils/llmsession.py b/src/copilot-chat/src/copilot_agent/utils/llmsession.py index 69269937..38dac469 100644 --- a/src/copilot-chat/src/copilot_agent/utils/llmsession.py +++ b/src/copilot-chat/src/copilot_agent/utils/llmsession.py @@ -10,6 +10,8 @@ class LLMSession: """A class to interact with the Azure OpenAI model.""" + # Global stream callback that external code (server endpoint) can set + _global_stream_callback = None def __init__(self): # Env Var to set the LLM provider, accepted values are 'openai' or 'azure' @@ -112,4 +114,131 @@ def get_embedding(self, text): raise ValueError(f"Unsupported LLM provider: {self.provider}") except Exception as e: logger.error(f"Error getting embedding: {e}") - raise \ No newline at end of file + raise + + def stream_chat(self, system_prompt, user_prompt): + """Stream chat responses from the language model as a generator yielding text chunks. 
+ + This method works with both the OpenAI and Azure OpenAI clients used in this project. + It yields incremental text chunks as they arrive from the SDK's streaming API. Callers + can iterate over the generator to provide a streaming UX. If streaming fails after + retries, a single fallback message chunk will be yielded. + """ + msg = [ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": user_prompt} + ] + + max_retries = 5 + backoff = 2 # Initial backoff in seconds + + for attempt in range(max_retries): + try: + # Start streaming from the provider. The SDK returns an iterator of events. + if self.provider == "azure": + stream = self.model.chat.completions.create( + model=self.model_name, + messages=msg, + max_completion_tokens=10000, + stream=True + ) + elif self.provider == "openai": + stream = self.model.chat.completions.create( + model=self.model_name, + messages=msg, + max_tokens=10000, + stream=True + ) + else: + logger.error(f"Unsupported LLM provider in stream_chat: {self.provider}") + break + + # Iterate over streaming events and yield text increments. + # The exact shape of events can vary between SDK versions, so try a few access patterns. + full = '' + for event in stream: + chunk = None + try: + # event may be a mapping-like object or an SDK object with attributes + # Try dict-style access first + if isinstance(event, dict): + choices = event.get('choices') + else: + choices = getattr(event, 'choices', None) + + if choices: + # Support both mapping and attribute access for choice and delta + choice = choices[0] + + # choice might be a dict or an object + if isinstance(choice, dict): + delta = choice.get('delta', {}) + chunk = delta.get('content') + else: + # object-like access + delta = getattr(choice, 'delta', None) + if delta is not None: + # delta could be a mapping or an object + if isinstance(delta, dict): + chunk = delta.get('content') + else: + chunk = getattr(delta, 'content', None) + except Exception: + # Be resilient to unexpected event shapes + chunk = None + + if chunk: + # accumulate to reconstruct full text and yield the full snapshot + full += chunk + # call global callback if configured (external subscribers) + try: + cb = LLMSession._global_stream_callback + if cb: + cb(full) + except Exception: + logger.debug('Global stream callback failed') + yield full + + # If stream finishes without exception, stop generator normally + return + + except Exception as e: + if "429" in str(e): + logger.warning(f"429 Too Many Requests: Retrying in {backoff} seconds (Attempt {attempt + 1}/{max_retries})") + time.sleep(backoff) + backoff *= 2 # Exponential backoff + else: + logger.error(f"Unexpected error in stream_chat: {e}") + break + + # If retries are exhausted, yield a meaningful fallback chunk so callers can display something + logger.error("Exceeded maximum retries for chat stream request.") + yield "The system is currently overloaded. Please try again later." + + @classmethod + def set_global_stream_callback(cls, cb): + cls._global_stream_callback = cb + + @classmethod + def clear_global_stream_callback(cls): + cls._global_stream_callback = None + + def try_stream_fallback_chat(self, system_prompt: str, user_prompt: str) -> str: + """Try streaming the response (if a global stream callback is set) and fall back to the blocking chat call. + + Returns the final aggregated text. 
+ """ + try: + if getattr(LLMSession, '_global_stream_callback', None): + logger.info('LLMSession: streaming via try_stream_fallback_chat') + last = '' + for snapshot in self.stream_chat(system_prompt, user_prompt): + if snapshot: + last = snapshot + return last + else: + logger.info('LLMSession: non-streaming via try_stream_fallback_chat') + return self.chat(system_prompt, user_prompt) + except Exception as e: + logger.error(f"try_stream_fallback_chat failed: {e}. Falling back to non-streaming chat.") + return self.chat(system_prompt, user_prompt) \ No newline at end of file diff --git a/src/copilot-chat/src/copilot_agent/utils/push_frontend.py b/src/copilot-chat/src/copilot_agent/utils/push_frontend.py new file mode 100644 index 00000000..26598c2b --- /dev/null +++ b/src/copilot-chat/src/copilot_agent/utils/push_frontend.py @@ -0,0 +1,27 @@ +import json + +from ..utils.logger import logger +from ..utils.llmsession import LLMSession + +def push_frontend_event(content: str, replace: bool = False): + """Push an event to the frontend.""" + # If streaming is active, push only the appended content as a JSON "append" event + try: + cb = LLMSession._global_stream_callback + if cb: + if replace: + cb(content) + else: + cb(json.dumps({"type": "append", "text": content})) + except Exception as e: + logger.debug(f"Failed to stream appended content: {e}") + + +def push_frontend_meta(message_info: dict): + """Push a metadata event (messageInfo) to the frontend so client can attach turnId before answer arrives.""" + try: + cb = LLMSession._global_stream_callback + if cb: + cb(json.dumps({"type": "meta", "messageInfo": message_info})) + except Exception as e: + logger.debug(f"Failed to stream meta event: {e}") \ No newline at end of file diff --git a/src/copilot-chat/src/copilot_agent/utils/smart_help.py b/src/copilot-chat/src/copilot_agent/utils/smart_help.py index 9023b9fe..3560c44b 100644 --- a/src/copilot-chat/src/copilot_agent/utils/smart_help.py +++ b/src/copilot-chat/src/copilot_agent/utils/smart_help.py @@ -43,10 +43,10 @@ def gen_smart_help(help_msg, user_question: str, key_lst: list, SMART_HELP=True) question_prompt = f'[user question]\n {user_question} \n\n' user_prompt = question_prompt + f'[reason to generate the help]\n str{key_lst} \n\n' + capability_promp # send to a LLM session to generate a smart help - smart_help = model.chat(sys_prompt, user_prompt) + smart_help = model.try_stream_fallback_chat(sys_prompt, user_prompt) final_help = smart_help else: dump_help_prompt = f'[reason to generate the help]\n {dump_help} \n\n' - final_help = model.chat(sys_prompt, dump_help_prompt) + final_help = model.try_stream_fallback_chat(sys_prompt, dump_help_prompt) return final_help diff --git a/src/copilot-chat/src/copilot_agent/utils/summary.py b/src/copilot-chat/src/copilot_agent/utils/summary.py index a994fb1b..886dd94e 100644 --- a/src/copilot-chat/src/copilot_agent/utils/summary.py +++ b/src/copilot-chat/src/copilot_agent/utils/summary.py @@ -44,7 +44,8 @@ def gen_summary( sys_prompt = gen_sum_prompt + '\n\n' + knowledge_prmpt + '\n\n' if not skip_summary: logger.info('Bypass summary: False') - summary = model.chat(sys_prompt, user_prompt) + # try stream chat, if fail, fall back to chat + summary = model.try_stream_fallback_chat(sys_prompt, user_prompt) else: logger.info('Bypass summary: True') summary = handle_bypass_summary(resp_total, resp_brief) From 4278f23902463ca0c522d3fbee36653f5c0991d5 Mon Sep 17 00:00:00 2001 From: Lei Date: Fri, 12 Sep 2025 15:11:31 +0800 Subject: [PATCH 02/34] 
change: use frontend to assign turnId --- contrib/copilot-plugin/src/app/ChatBox.tsx | 32 ++++++++++++------- contrib/copilot-plugin/src/libs/state.ts | 11 +++++++ .../src/copilot_agent/utils/logger.py | 21 +++++++++--- 3 files changed, 49 insertions(+), 15 deletions(-) diff --git a/contrib/copilot-plugin/src/app/ChatBox.tsx b/contrib/copilot-plugin/src/app/ChatBox.tsx index 1b4472c1..942e7e11 100644 --- a/contrib/copilot-plugin/src/app/ChatBox.tsx +++ b/contrib/copilot-plugin/src/app/ChatBox.tsx @@ -38,6 +38,17 @@ export default function ChatBox() { setPrompt(""); setLoading(true); try { + // create a stable turnId and include it in the payload so server will echo/use it + const turnId = uuidv4(); + const messageInfo = { + userId: paiuser, + convId: currentConversationId, + turnId: turnId, + timestamp: Math.floor(Date.now()), + timestampUnit: "ms", + type: "question", + }; + const payload = { async_: false, stream: false, @@ -48,20 +59,14 @@ export default function ChatBox() { username: paiuser, restToken: restServerToken, jobToken: jobServerToken, - currentJob: null // currentJob ? { id: currentJob.id, name: currentJob.name, username: currentJob.username, status: currentJob.status, ip: currentJob.ip, port: currentJob.port } : null + currentJob: null }, - messageInfo: { - userId: paiuser, - convId: currentConversationId, - turnId: uuidv4(), - timestamp: Math.floor(Date.now()), - timestampUnit: "ms", - type: "question", - } + messageInfo: messageInfo } }; - // Create assistant placeholder and stream response into it - useChatStore.getState().addChat({ role: "assistant", message: "", timestamp: new Date() }); + + // Create assistant placeholder and attach the same messageInfo (turnId) so feedback maps to this response + useChatStore.getState().addChat({ role: "assistant", message: "", timestamp: new Date(), messageInfo }); const response = await fetch(REMOTE_SERVER_URL, { method: "POST", headers: { @@ -114,6 +119,11 @@ export default function ChatBox() { useChatStore.getState().appendToLastAssistant(parsed.text); handled = true; } + else if (parsed && parsed.type === 'meta' && parsed.messageInfo) { + // attach backend-generated messageInfo (turnId etc.) 
to the last assistant message + useChatStore.getState().setLastAssistantMessageInfo(parsed.messageInfo); + handled = true; + } } catch (e) { // not JSON, fall through to full replace } diff --git a/contrib/copilot-plugin/src/libs/state.ts b/contrib/copilot-plugin/src/libs/state.ts index 2a10d501..ca061649 100644 --- a/contrib/copilot-plugin/src/libs/state.ts +++ b/contrib/copilot-plugin/src/libs/state.ts @@ -57,6 +57,7 @@ interface State { addChat: (chat: ChatMessage) => void; appendToLastAssistant: (chunk: string) => void; replaceLastAssistant: (text: string) => void; + setLastAssistantMessageInfo: (info: any) => void; // Conversation management actions generateNewConversationId: () => void; @@ -120,6 +121,16 @@ export const useChatStore = create((set) => ({ } return { chatMsgs: msgs }; }), + setLastAssistantMessageInfo: (info: any) => set((state) => { + const msgs = [...state.chatMsgs]; + for (let i = msgs.length - 1; i >= 0; i--) { + if (msgs[i].role === 'assistant') { + msgs[i] = { ...msgs[i], messageInfo: info }; + break; + } + } + return { chatMsgs: msgs }; + }), // Generate a new conversation ID (useful for starting a new conversation) generateNewConversationId: () => set((state) => ({ diff --git a/src/copilot-chat/src/copilot_agent/utils/logger.py b/src/copilot-chat/src/copilot_agent/utils/logger.py index 48ca6f6a..abfcfead 100644 --- a/src/copilot-chat/src/copilot_agent/utils/logger.py +++ b/src/copilot-chat/src/copilot_agent/utils/logger.py @@ -53,12 +53,25 @@ def setup_logging(): # Expose the root logger so all modules use the same logger instance class SimpleLogger: def info(self, msg): - print(f"{Fore.GREEN}[INFO]{Style.RESET_ALL} {msg}") + try: + print(f"{Fore.GREEN}[INFO]{Style.RESET_ALL} {msg}", flush=True) + except (BrokenPipeError, OSError): + # stdout/stderr closed (client disconnected); fallback to Python logging + logging.getLogger().info(msg) def error(self, msg): - print(f"{Fore.RED}[ERROR]{Style.RESET_ALL} {msg}") + try: + print(f"{Fore.RED}[ERROR]{Style.RESET_ALL} {msg}", flush=True) + except (BrokenPipeError, OSError): + logging.getLogger().error(msg) def debug(self, msg): - print(f"{Fore.CYAN}[DEBUG]{Style.RESET_ALL} {msg}") + try: + print(f"{Fore.CYAN}[DEBUG]{Style.RESET_ALL} {msg}", flush=True) + except (BrokenPipeError, OSError): + logging.getLogger().debug(msg) def warning(self, msg): - print(f"{Fore.YELLOW}[WARNING]{Style.RESET_ALL} {msg}") + try: + print(f"{Fore.YELLOW}[WARNING]{Style.RESET_ALL} {msg}", flush=True) + except (BrokenPipeError, OSError): + logging.getLogger().warning(msg) logger = SimpleLogger() From 7e01649d7da625caa262ab4c1eecba79fffc43c7 Mon Sep 17 00:00:00 2001 From: Lei Date: Fri, 12 Sep 2025 19:02:39 +0800 Subject: [PATCH 03/34] modify: state update help message --- contrib/copilot-plugin/package-lock.json | 553 ++++++++++++++++++ contrib/copilot-plugin/package.json | 1 + .../copilot-plugin/src/app/ChatHistory.tsx | 2 + .../src/copilot_agent/copilot_turn.py | 2 +- src/copilot-chat/src/copilot_agent/ltp/ltp.py | 3 +- 5 files changed, 559 insertions(+), 2 deletions(-) diff --git a/contrib/copilot-plugin/package-lock.json b/contrib/copilot-plugin/package-lock.json index 106ab43d..2f0b3bfa 100644 --- a/contrib/copilot-plugin/package-lock.json +++ b/contrib/copilot-plugin/package-lock.json @@ -57,6 +57,7 @@ "react-icons": "^5.5.0", "react-markdown": "^10.1.0", "react-refresh": "^0.11.0", + "rehype-raw": "^6.1.1", "remark-gfm": "^4.0.1", "resolve": "^1.20.0", "resolve-url-loader": "^4.0.0", @@ -3948,6 +3949,12 @@ "integrity": 
"sha512-dISoDXWWQwUquiKsyZ4Ng+HX2KsPL7LyHKHQwgGFEA3IaKac4Obd+h2a/a6waisAoepJlBcx9paWqjA8/HVjCw==", "license": "MIT" }, + "node_modules/@types/parse5": { + "version": "6.0.3", + "resolved": "https://registry.npmjs.org/@types/parse5/-/parse5-6.0.3.tgz", + "integrity": "sha512-SuT16Q1K51EAVPz1K29DJ/sXjhSQ0zjvsypYJ6tlwVsRV9jwW5Adq2ch8Dq8kDBCkYnELS7N7VNCSB5nC56t/g==", + "license": "MIT" + }, "node_modules/@types/prettier": { "version": "2.7.3", "resolved": "https://registry.npmjs.org/@types/prettier/-/prettier-2.7.3.tgz", @@ -9532,6 +9539,257 @@ "node": ">= 0.4" } }, + "node_modules/hast-util-from-parse5": { + "version": "7.1.2", + "resolved": "https://registry.npmjs.org/hast-util-from-parse5/-/hast-util-from-parse5-7.1.2.tgz", + "integrity": "sha512-Nz7FfPBuljzsN3tCQ4kCBKqdNhQE2l0Tn+X1ubgKBPRoiDIu1mL08Cfw4k7q71+Duyaw7DXDN+VTAp4Vh3oCOw==", + "license": "MIT", + "dependencies": { + "@types/hast": "^2.0.0", + "@types/unist": "^2.0.0", + "hastscript": "^7.0.0", + "property-information": "^6.0.0", + "vfile": "^5.0.0", + "vfile-location": "^4.0.0", + "web-namespaces": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/hast-util-from-parse5/node_modules/@types/hast": { + "version": "2.3.10", + "resolved": "https://registry.npmjs.org/@types/hast/-/hast-2.3.10.tgz", + "integrity": "sha512-McWspRw8xx8J9HurkVBfYj0xKoE25tOFlHGdx4MJ5xORQrMGZNqJhVQWaIbm6Oyla5kYOXtDiopzKRJzEOkwJw==", + "license": "MIT", + "dependencies": { + "@types/unist": "^2" + } + }, + "node_modules/hast-util-from-parse5/node_modules/@types/unist": { + "version": "2.0.11", + "resolved": "https://registry.npmjs.org/@types/unist/-/unist-2.0.11.tgz", + "integrity": "sha512-CmBKiL6NNo/OqgmMn95Fk9Whlp2mtvIv+KNpQKN2F4SjvrEesubTRWGYSg+BnWZOnlCaSTU1sMpsBOzgbYhnsA==", + "license": "MIT" + }, + "node_modules/hast-util-from-parse5/node_modules/property-information": { + "version": "6.5.0", + "resolved": "https://registry.npmjs.org/property-information/-/property-information-6.5.0.tgz", + "integrity": "sha512-PgTgs/BlvHxOu8QuEN7wi5A0OmXaBcHpmCSTehcs6Uuu9IkDIEo13Hy7n898RHfrQ49vKCoGeWZSaAK01nwVig==", + "license": "MIT", + "funding": { + "type": "github", + "url": "https://github.com/sponsors/wooorm" + } + }, + "node_modules/hast-util-from-parse5/node_modules/unist-util-stringify-position": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/unist-util-stringify-position/-/unist-util-stringify-position-3.0.3.tgz", + "integrity": "sha512-k5GzIBZ/QatR8N5X2y+drfpWG8IDBzdnVj6OInRNWm1oXrzydiaAT2OQiA8DPRRZyAKb9b6I2a6PxYklZD0gKg==", + "license": "MIT", + "dependencies": { + "@types/unist": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/hast-util-from-parse5/node_modules/vfile": { + "version": "5.3.7", + "resolved": "https://registry.npmjs.org/vfile/-/vfile-5.3.7.tgz", + "integrity": "sha512-r7qlzkgErKjobAmyNIkkSpizsFPYiUPuJb5pNW1RB4JcYVZhs4lIbVqk8XPk033CV/1z8ss5pkax8SuhGpcG8g==", + "license": "MIT", + "dependencies": { + "@types/unist": "^2.0.0", + "is-buffer": "^2.0.0", + "unist-util-stringify-position": "^3.0.0", + "vfile-message": "^3.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/hast-util-from-parse5/node_modules/vfile-message": { + "version": "3.1.4", + "resolved": "https://registry.npmjs.org/vfile-message/-/vfile-message-3.1.4.tgz", + "integrity": 
"sha512-fa0Z6P8HUrQN4BZaX05SIVXic+7kE3b05PWAtPuYP9QLHsLKYR7/AlLW3NtOrpXRLeawpDLMsVkmk5DG0NXgWw==", + "license": "MIT", + "dependencies": { + "@types/unist": "^2.0.0", + "unist-util-stringify-position": "^3.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/hast-util-parse-selector": { + "version": "3.1.1", + "resolved": "https://registry.npmjs.org/hast-util-parse-selector/-/hast-util-parse-selector-3.1.1.tgz", + "integrity": "sha512-jdlwBjEexy1oGz0aJ2f4GKMaVKkA9jwjr4MjAAI22E5fM/TXVZHuS5OpONtdeIkRKqAaryQ2E9xNQxijoThSZA==", + "license": "MIT", + "dependencies": { + "@types/hast": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/hast-util-parse-selector/node_modules/@types/hast": { + "version": "2.3.10", + "resolved": "https://registry.npmjs.org/@types/hast/-/hast-2.3.10.tgz", + "integrity": "sha512-McWspRw8xx8J9HurkVBfYj0xKoE25tOFlHGdx4MJ5xORQrMGZNqJhVQWaIbm6Oyla5kYOXtDiopzKRJzEOkwJw==", + "license": "MIT", + "dependencies": { + "@types/unist": "^2" + } + }, + "node_modules/hast-util-parse-selector/node_modules/@types/unist": { + "version": "2.0.11", + "resolved": "https://registry.npmjs.org/@types/unist/-/unist-2.0.11.tgz", + "integrity": "sha512-CmBKiL6NNo/OqgmMn95Fk9Whlp2mtvIv+KNpQKN2F4SjvrEesubTRWGYSg+BnWZOnlCaSTU1sMpsBOzgbYhnsA==", + "license": "MIT" + }, + "node_modules/hast-util-raw": { + "version": "7.2.3", + "resolved": "https://registry.npmjs.org/hast-util-raw/-/hast-util-raw-7.2.3.tgz", + "integrity": "sha512-RujVQfVsOrxzPOPSzZFiwofMArbQke6DJjnFfceiEbFh7S05CbPt0cYN+A5YeD3pso0JQk6O1aHBnx9+Pm2uqg==", + "license": "MIT", + "dependencies": { + "@types/hast": "^2.0.0", + "@types/parse5": "^6.0.0", + "hast-util-from-parse5": "^7.0.0", + "hast-util-to-parse5": "^7.0.0", + "html-void-elements": "^2.0.0", + "parse5": "^6.0.0", + "unist-util-position": "^4.0.0", + "unist-util-visit": "^4.0.0", + "vfile": "^5.0.0", + "web-namespaces": "^2.0.0", + "zwitch": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/hast-util-raw/node_modules/@types/hast": { + "version": "2.3.10", + "resolved": "https://registry.npmjs.org/@types/hast/-/hast-2.3.10.tgz", + "integrity": "sha512-McWspRw8xx8J9HurkVBfYj0xKoE25tOFlHGdx4MJ5xORQrMGZNqJhVQWaIbm6Oyla5kYOXtDiopzKRJzEOkwJw==", + "license": "MIT", + "dependencies": { + "@types/unist": "^2" + } + }, + "node_modules/hast-util-raw/node_modules/@types/unist": { + "version": "2.0.11", + "resolved": "https://registry.npmjs.org/@types/unist/-/unist-2.0.11.tgz", + "integrity": "sha512-CmBKiL6NNo/OqgmMn95Fk9Whlp2mtvIv+KNpQKN2F4SjvrEesubTRWGYSg+BnWZOnlCaSTU1sMpsBOzgbYhnsA==", + "license": "MIT" + }, + "node_modules/hast-util-raw/node_modules/unist-util-is": { + "version": "5.2.1", + "resolved": "https://registry.npmjs.org/unist-util-is/-/unist-util-is-5.2.1.tgz", + "integrity": "sha512-u9njyyfEh43npf1M+yGKDGVPbY/JWEemg5nH05ncKPfi+kBbKBJoTdsogMu33uhytuLlv9y0O7GH7fEdwLdLQw==", + "license": "MIT", + "dependencies": { + "@types/unist": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/hast-util-raw/node_modules/unist-util-position": { + "version": "4.0.4", + "resolved": "https://registry.npmjs.org/unist-util-position/-/unist-util-position-4.0.4.tgz", + "integrity": "sha512-kUBE91efOWfIVBo8xzh/uZQ7p9ffYRtUbMRZBNFYwf0RK8koUMx6dGUfwylLOKmaT2cs4wSW96QoYUSXAyEtpg==", + "license": 
"MIT", + "dependencies": { + "@types/unist": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/hast-util-raw/node_modules/unist-util-stringify-position": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/unist-util-stringify-position/-/unist-util-stringify-position-3.0.3.tgz", + "integrity": "sha512-k5GzIBZ/QatR8N5X2y+drfpWG8IDBzdnVj6OInRNWm1oXrzydiaAT2OQiA8DPRRZyAKb9b6I2a6PxYklZD0gKg==", + "license": "MIT", + "dependencies": { + "@types/unist": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/hast-util-raw/node_modules/unist-util-visit": { + "version": "4.1.2", + "resolved": "https://registry.npmjs.org/unist-util-visit/-/unist-util-visit-4.1.2.tgz", + "integrity": "sha512-MSd8OUGISqHdVvfY9TPhyK2VdUrPgxkUtWSuMHF6XAAFuL4LokseigBnZtPnJMu+FbynTkFNnFlyjxpVKujMRg==", + "license": "MIT", + "dependencies": { + "@types/unist": "^2.0.0", + "unist-util-is": "^5.0.0", + "unist-util-visit-parents": "^5.1.1" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/hast-util-raw/node_modules/unist-util-visit-parents": { + "version": "5.1.3", + "resolved": "https://registry.npmjs.org/unist-util-visit-parents/-/unist-util-visit-parents-5.1.3.tgz", + "integrity": "sha512-x6+y8g7wWMyQhL1iZfhIPhDAs7Xwbn9nRosDXl7qoPTSCy0yNxnKc+hWokFifWQIDGi154rdUqKvbCa4+1kLhg==", + "license": "MIT", + "dependencies": { + "@types/unist": "^2.0.0", + "unist-util-is": "^5.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/hast-util-raw/node_modules/vfile": { + "version": "5.3.7", + "resolved": "https://registry.npmjs.org/vfile/-/vfile-5.3.7.tgz", + "integrity": "sha512-r7qlzkgErKjobAmyNIkkSpizsFPYiUPuJb5pNW1RB4JcYVZhs4lIbVqk8XPk033CV/1z8ss5pkax8SuhGpcG8g==", + "license": "MIT", + "dependencies": { + "@types/unist": "^2.0.0", + "is-buffer": "^2.0.0", + "unist-util-stringify-position": "^3.0.0", + "vfile-message": "^3.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/hast-util-raw/node_modules/vfile-message": { + "version": "3.1.4", + "resolved": "https://registry.npmjs.org/vfile-message/-/vfile-message-3.1.4.tgz", + "integrity": "sha512-fa0Z6P8HUrQN4BZaX05SIVXic+7kE3b05PWAtPuYP9QLHsLKYR7/AlLW3NtOrpXRLeawpDLMsVkmk5DG0NXgWw==", + "license": "MIT", + "dependencies": { + "@types/unist": "^2.0.0", + "unist-util-stringify-position": "^3.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, "node_modules/hast-util-to-jsx-runtime": { "version": "2.3.6", "resolved": "https://registry.npmjs.org/hast-util-to-jsx-runtime/-/hast-util-to-jsx-runtime-2.3.6.tgz", @@ -9559,6 +9817,49 @@ "url": "https://opencollective.com/unified" } }, + "node_modules/hast-util-to-parse5": { + "version": "7.1.0", + "resolved": "https://registry.npmjs.org/hast-util-to-parse5/-/hast-util-to-parse5-7.1.0.tgz", + "integrity": "sha512-YNRgAJkH2Jky5ySkIqFXTQiaqcAtJyVE+D5lkN6CdtOqrnkLfGYYrEcKuHOJZlp+MwjSwuD3fZuawI+sic/RBw==", + "license": "MIT", + "dependencies": { + "@types/hast": "^2.0.0", + "comma-separated-tokens": "^2.0.0", + "property-information": "^6.0.0", + "space-separated-tokens": "^2.0.0", + "web-namespaces": "^2.0.0", + "zwitch": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" 
+ } + }, + "node_modules/hast-util-to-parse5/node_modules/@types/hast": { + "version": "2.3.10", + "resolved": "https://registry.npmjs.org/@types/hast/-/hast-2.3.10.tgz", + "integrity": "sha512-McWspRw8xx8J9HurkVBfYj0xKoE25tOFlHGdx4MJ5xORQrMGZNqJhVQWaIbm6Oyla5kYOXtDiopzKRJzEOkwJw==", + "license": "MIT", + "dependencies": { + "@types/unist": "^2" + } + }, + "node_modules/hast-util-to-parse5/node_modules/@types/unist": { + "version": "2.0.11", + "resolved": "https://registry.npmjs.org/@types/unist/-/unist-2.0.11.tgz", + "integrity": "sha512-CmBKiL6NNo/OqgmMn95Fk9Whlp2mtvIv+KNpQKN2F4SjvrEesubTRWGYSg+BnWZOnlCaSTU1sMpsBOzgbYhnsA==", + "license": "MIT" + }, + "node_modules/hast-util-to-parse5/node_modules/property-information": { + "version": "6.5.0", + "resolved": "https://registry.npmjs.org/property-information/-/property-information-6.5.0.tgz", + "integrity": "sha512-PgTgs/BlvHxOu8QuEN7wi5A0OmXaBcHpmCSTehcs6Uuu9IkDIEo13Hy7n898RHfrQ49vKCoGeWZSaAK01nwVig==", + "license": "MIT", + "funding": { + "type": "github", + "url": "https://github.com/sponsors/wooorm" + } + }, "node_modules/hast-util-whitespace": { "version": "3.0.0", "resolved": "https://registry.npmjs.org/hast-util-whitespace/-/hast-util-whitespace-3.0.0.tgz", @@ -9572,6 +9873,48 @@ "url": "https://opencollective.com/unified" } }, + "node_modules/hastscript": { + "version": "7.2.0", + "resolved": "https://registry.npmjs.org/hastscript/-/hastscript-7.2.0.tgz", + "integrity": "sha512-TtYPq24IldU8iKoJQqvZOuhi5CyCQRAbvDOX0x1eW6rsHSxa/1i2CCiptNTotGHJ3VoHRGmqiv6/D3q113ikkw==", + "license": "MIT", + "dependencies": { + "@types/hast": "^2.0.0", + "comma-separated-tokens": "^2.0.0", + "hast-util-parse-selector": "^3.0.0", + "property-information": "^6.0.0", + "space-separated-tokens": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/hastscript/node_modules/@types/hast": { + "version": "2.3.10", + "resolved": "https://registry.npmjs.org/@types/hast/-/hast-2.3.10.tgz", + "integrity": "sha512-McWspRw8xx8J9HurkVBfYj0xKoE25tOFlHGdx4MJ5xORQrMGZNqJhVQWaIbm6Oyla5kYOXtDiopzKRJzEOkwJw==", + "license": "MIT", + "dependencies": { + "@types/unist": "^2" + } + }, + "node_modules/hastscript/node_modules/@types/unist": { + "version": "2.0.11", + "resolved": "https://registry.npmjs.org/@types/unist/-/unist-2.0.11.tgz", + "integrity": "sha512-CmBKiL6NNo/OqgmMn95Fk9Whlp2mtvIv+KNpQKN2F4SjvrEesubTRWGYSg+BnWZOnlCaSTU1sMpsBOzgbYhnsA==", + "license": "MIT" + }, + "node_modules/hastscript/node_modules/property-information": { + "version": "6.5.0", + "resolved": "https://registry.npmjs.org/property-information/-/property-information-6.5.0.tgz", + "integrity": "sha512-PgTgs/BlvHxOu8QuEN7wi5A0OmXaBcHpmCSTehcs6Uuu9IkDIEo13Hy7n898RHfrQ49vKCoGeWZSaAK01nwVig==", + "license": "MIT", + "funding": { + "type": "github", + "url": "https://github.com/sponsors/wooorm" + } + }, "node_modules/he": { "version": "1.2.0", "resolved": "https://registry.npmjs.org/he/-/he-1.2.0.tgz", @@ -9714,6 +10057,16 @@ "url": "https://opencollective.com/unified" } }, + "node_modules/html-void-elements": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/html-void-elements/-/html-void-elements-2.0.1.tgz", + "integrity": "sha512-0quDb7s97CfemeJAnW9wC0hw78MtW7NU3hqtCD75g2vFlDLt36llsYD7uB7SUzojLMP24N5IatXf7ylGXiGG9A==", + "license": "MIT", + "funding": { + "type": "github", + "url": "https://github.com/sponsors/wooorm" + } + }, "node_modules/html-webpack-plugin": { "version": "5.6.3", "resolved": 
"https://registry.npmjs.org/html-webpack-plugin/-/html-webpack-plugin-5.6.3.tgz", @@ -10151,6 +10504,29 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/is-buffer": { + "version": "2.0.5", + "resolved": "https://registry.npmjs.org/is-buffer/-/is-buffer-2.0.5.tgz", + "integrity": "sha512-i2R6zNFDwgEHJyQUtJEk0XFi1i0dPFn/oqjK3/vPCcDeJvW5NQ83V8QbicfF1SupOaB0h8ntgBC2YiE7dfyctQ==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/feross" + }, + { + "type": "patreon", + "url": "https://www.patreon.com/feross" + }, + { + "type": "consulting", + "url": "https://feross.org/support" + } + ], + "license": "MIT", + "engines": { + "node": ">=4" + } + }, "node_modules/is-callable": { "version": "1.2.7", "resolved": "https://registry.npmjs.org/is-callable/-/is-callable-1.2.7.tgz", @@ -16004,6 +16380,110 @@ "node": ">=6" } }, + "node_modules/rehype-raw": { + "version": "6.1.1", + "resolved": "https://registry.npmjs.org/rehype-raw/-/rehype-raw-6.1.1.tgz", + "integrity": "sha512-d6AKtisSRtDRX4aSPsJGTfnzrX2ZkHQLE5kiUuGOeEoLpbEulFF4hj0mLPbsa+7vmguDKOVVEQdHKDSwoaIDsQ==", + "license": "MIT", + "dependencies": { + "@types/hast": "^2.0.0", + "hast-util-raw": "^7.2.0", + "unified": "^10.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/rehype-raw/node_modules/@types/hast": { + "version": "2.3.10", + "resolved": "https://registry.npmjs.org/@types/hast/-/hast-2.3.10.tgz", + "integrity": "sha512-McWspRw8xx8J9HurkVBfYj0xKoE25tOFlHGdx4MJ5xORQrMGZNqJhVQWaIbm6Oyla5kYOXtDiopzKRJzEOkwJw==", + "license": "MIT", + "dependencies": { + "@types/unist": "^2" + } + }, + "node_modules/rehype-raw/node_modules/@types/unist": { + "version": "2.0.11", + "resolved": "https://registry.npmjs.org/@types/unist/-/unist-2.0.11.tgz", + "integrity": "sha512-CmBKiL6NNo/OqgmMn95Fk9Whlp2mtvIv+KNpQKN2F4SjvrEesubTRWGYSg+BnWZOnlCaSTU1sMpsBOzgbYhnsA==", + "license": "MIT" + }, + "node_modules/rehype-raw/node_modules/is-plain-obj": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/is-plain-obj/-/is-plain-obj-4.1.0.tgz", + "integrity": "sha512-+Pgi+vMuUNkJyExiMBt5IlFoMyKnr5zhJ4Uspz58WOhBF5QoIZkFyNHIbBAtHwzVAgk5RtndVNsDRN61/mmDqg==", + "license": "MIT", + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/rehype-raw/node_modules/unified": { + "version": "10.1.2", + "resolved": "https://registry.npmjs.org/unified/-/unified-10.1.2.tgz", + "integrity": "sha512-pUSWAi/RAnVy1Pif2kAoeWNBa3JVrx0MId2LASj8G+7AiHWoKZNTomq6LG326T68U7/e263X6fTdcXIy7XnF7Q==", + "license": "MIT", + "dependencies": { + "@types/unist": "^2.0.0", + "bail": "^2.0.0", + "extend": "^3.0.0", + "is-buffer": "^2.0.0", + "is-plain-obj": "^4.0.0", + "trough": "^2.0.0", + "vfile": "^5.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/rehype-raw/node_modules/unist-util-stringify-position": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/unist-util-stringify-position/-/unist-util-stringify-position-3.0.3.tgz", + "integrity": "sha512-k5GzIBZ/QatR8N5X2y+drfpWG8IDBzdnVj6OInRNWm1oXrzydiaAT2OQiA8DPRRZyAKb9b6I2a6PxYklZD0gKg==", + "license": "MIT", + "dependencies": { + "@types/unist": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/rehype-raw/node_modules/vfile": { + "version": "5.3.7", + "resolved": 
"https://registry.npmjs.org/vfile/-/vfile-5.3.7.tgz", + "integrity": "sha512-r7qlzkgErKjobAmyNIkkSpizsFPYiUPuJb5pNW1RB4JcYVZhs4lIbVqk8XPk033CV/1z8ss5pkax8SuhGpcG8g==", + "license": "MIT", + "dependencies": { + "@types/unist": "^2.0.0", + "is-buffer": "^2.0.0", + "unist-util-stringify-position": "^3.0.0", + "vfile-message": "^3.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/rehype-raw/node_modules/vfile-message": { + "version": "3.1.4", + "resolved": "https://registry.npmjs.org/vfile-message/-/vfile-message-3.1.4.tgz", + "integrity": "sha512-fa0Z6P8HUrQN4BZaX05SIVXic+7kE3b05PWAtPuYP9QLHsLKYR7/AlLW3NtOrpXRLeawpDLMsVkmk5DG0NXgWw==", + "license": "MIT", + "dependencies": { + "@types/unist": "^2.0.0", + "unist-util-stringify-position": "^3.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, "node_modules/relateurl": { "version": "0.2.7", "resolved": "https://registry.npmjs.org/relateurl/-/relateurl-0.2.7.tgz", @@ -18816,6 +19296,69 @@ "url": "https://opencollective.com/unified" } }, + "node_modules/vfile-location": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/vfile-location/-/vfile-location-4.1.0.tgz", + "integrity": "sha512-YF23YMyASIIJXpktBa4vIGLJ5Gs88UB/XePgqPmTa7cDA+JeO3yclbpheQYCHjVHBn/yePzrXuygIL+xbvRYHw==", + "license": "MIT", + "dependencies": { + "@types/unist": "^2.0.0", + "vfile": "^5.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/vfile-location/node_modules/@types/unist": { + "version": "2.0.11", + "resolved": "https://registry.npmjs.org/@types/unist/-/unist-2.0.11.tgz", + "integrity": "sha512-CmBKiL6NNo/OqgmMn95Fk9Whlp2mtvIv+KNpQKN2F4SjvrEesubTRWGYSg+BnWZOnlCaSTU1sMpsBOzgbYhnsA==", + "license": "MIT" + }, + "node_modules/vfile-location/node_modules/unist-util-stringify-position": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/unist-util-stringify-position/-/unist-util-stringify-position-3.0.3.tgz", + "integrity": "sha512-k5GzIBZ/QatR8N5X2y+drfpWG8IDBzdnVj6OInRNWm1oXrzydiaAT2OQiA8DPRRZyAKb9b6I2a6PxYklZD0gKg==", + "license": "MIT", + "dependencies": { + "@types/unist": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/vfile-location/node_modules/vfile": { + "version": "5.3.7", + "resolved": "https://registry.npmjs.org/vfile/-/vfile-5.3.7.tgz", + "integrity": "sha512-r7qlzkgErKjobAmyNIkkSpizsFPYiUPuJb5pNW1RB4JcYVZhs4lIbVqk8XPk033CV/1z8ss5pkax8SuhGpcG8g==", + "license": "MIT", + "dependencies": { + "@types/unist": "^2.0.0", + "is-buffer": "^2.0.0", + "unist-util-stringify-position": "^3.0.0", + "vfile-message": "^3.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/vfile-location/node_modules/vfile-message": { + "version": "3.1.4", + "resolved": "https://registry.npmjs.org/vfile-message/-/vfile-message-3.1.4.tgz", + "integrity": "sha512-fa0Z6P8HUrQN4BZaX05SIVXic+7kE3b05PWAtPuYP9QLHsLKYR7/AlLW3NtOrpXRLeawpDLMsVkmk5DG0NXgWw==", + "license": "MIT", + "dependencies": { + "@types/unist": "^2.0.0", + "unist-util-stringify-position": "^3.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, "node_modules/vfile-message": { "version": "4.0.2", "resolved": "https://registry.npmjs.org/vfile-message/-/vfile-message-4.0.2.tgz", @@ -18883,6 +19426,16 @@ 
"minimalistic-assert": "^1.0.0" } }, + "node_modules/web-namespaces": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/web-namespaces/-/web-namespaces-2.0.1.tgz", + "integrity": "sha512-bKr1DkiNa2krS7qxNtdrtHAmzuYGFQLiQ13TsorsdT6ULTkPLKuu5+GsFpDlg6JFjUTwX2DyhMPG2be8uPrqsQ==", + "license": "MIT", + "funding": { + "type": "github", + "url": "https://github.com/sponsors/wooorm" + } + }, "node_modules/web-vitals": { "version": "2.1.4", "resolved": "https://registry.npmjs.org/web-vitals/-/web-vitals-2.1.4.tgz", diff --git a/contrib/copilot-plugin/package.json b/contrib/copilot-plugin/package.json index 1008a6b0..7de6c3b3 100644 --- a/contrib/copilot-plugin/package.json +++ b/contrib/copilot-plugin/package.json @@ -51,6 +51,7 @@ "react-dom": "^18.3.1", "react-icons": "^5.5.0", "react-markdown": "^10.1.0", + "rehype-raw": "^6.1.1", "react-refresh": "^0.11.0", "remark-gfm": "^4.0.1", "resolve": "^1.20.0", diff --git a/contrib/copilot-plugin/src/app/ChatHistory.tsx b/contrib/copilot-plugin/src/app/ChatHistory.tsx index df606311..f9bec61b 100644 --- a/contrib/copilot-plugin/src/app/ChatHistory.tsx +++ b/contrib/copilot-plugin/src/app/ChatHistory.tsx @@ -11,6 +11,7 @@ import { Bot, User, ThumbsUp, ThumbsDown } from "lucide-react"; import Markdown, { Components } from "react-markdown"; import remarkGfm from "remark-gfm"; +import rehypeRaw from 'rehype-raw'; import { ChatMessage, useChatStore } from "../libs/state"; import { Pane } from "../components/pane"; @@ -101,6 +102,7 @@ const CustomMarkdown: React.FC<{ content: string }> = ({ content }) => {
{props.children}; diff --git a/src/copilot-chat/src/copilot_agent/copilot_turn.py b/src/copilot-chat/src/copilot_agent/copilot_turn.py index 79ac5405..bf7b23ad 100644 --- a/src/copilot-chat/src/copilot_agent/copilot_turn.py +++ b/src/copilot-chat/src/copilot_agent/copilot_turn.py @@ -45,7 +45,7 @@ def process_turn(self, messages_list: list, skip_summary: bool = False, debuggin return {'category': None, 'answer': 'DEBUGGING MODE ENABLED', 'debug': {'debugging': debugging}} # get contextualized question from this and last user inquiry - push_frontend_event('<...understanding your inquiry...>', replace=True) + push_frontend_event('🤔 Copilot is understanding your inquiry...
', replace=False) this_inquiry = messages_list[-1]['content'] last_inquiry = messages_list[-3]['content'] if len(messages_list) > 2 else None question = contextualize_question(this_inquiry, last_inquiry) diff --git a/src/copilot-chat/src/copilot_agent/ltp/ltp.py b/src/copilot-chat/src/copilot_agent/ltp/ltp.py index bdfbbe66..38b806bf 100644 --- a/src/copilot-chat/src/copilot_agent/ltp/ltp.py +++ b/src/copilot-chat/src/copilot_agent/ltp/ltp.py @@ -169,7 +169,8 @@ def get_brief_job_metadata(resp): def query_powerbi(question: str, help_msg): """Query PowerBI data.""" - push_frontend_event('<...querying data to answer your inquiry...>', replace=True) + # send HTML snippet so frontend (with rehype-raw enabled) can render with Tailwind styling + push_frontend_event('🔍 Copilot is gathering data to answer your inquiry...
', replace=False) query_gen_res, query_gen_status = query_generation_kql(question) logger.info(f'KQL Query generation result: {query_gen_res}, status: {query_gen_status}') k_cluster = os.environ.get('DATA_SRC_KUSTO_CLUSTER_URL', '') From 434d8ec7d570bcabdc002d19e90024b74719a02e Mon Sep 17 00:00:00 2001 From: Lei Date: Fri, 12 Sep 2025 20:23:30 +0800 Subject: [PATCH 04/34] improve: auto scroll --- contrib/copilot-plugin/src/app/ChatBox.tsx | 24 +++- .../copilot-plugin/src/app/ChatHistory.tsx | 130 +++++++++++++++++- .../copilot-plugin/src/components/pane.tsx | 29 ++-- 3 files changed, 162 insertions(+), 21 deletions(-) diff --git a/contrib/copilot-plugin/src/app/ChatBox.tsx b/contrib/copilot-plugin/src/app/ChatBox.tsx index 942e7e11..8f7d9270 100644 --- a/contrib/copilot-plugin/src/app/ChatBox.tsx +++ b/contrib/copilot-plugin/src/app/ChatBox.tsx @@ -130,9 +130,27 @@ export default function ChatBox() { } if (!handled) { - // Replace the last assistant message with the full reconstructed text - useChatStore.getState().replaceLastAssistant(dataStr); - } + // If server sent a full snapshot repeatedly (common when backend doesn't send structured append events), + // detect the already-displayed prefix and append only the new suffix. This avoids blinking and missing lines + // during rapid streaming of many list items. + const store = useChatStore.getState(); + const msgs = store.chatMsgs; + let lastAssistant = ""; + for (let i = msgs.length - 1; i >= 0; i--) { + if (msgs[i].role === 'assistant') { + lastAssistant = msgs[i].message || ''; + break; + } + } + + if (lastAssistant && dataStr.startsWith(lastAssistant)) { + const suffix = dataStr.slice(lastAssistant.length); + if (suffix.length > 0) store.appendToLastAssistant(suffix); + } else { + // Fallback: replace the last assistant message with the full reconstructed text + store.replaceLastAssistant(dataStr); + } + } } if (isDoneEvent) { diff --git a/contrib/copilot-plugin/src/app/ChatHistory.tsx b/contrib/copilot-plugin/src/app/ChatHistory.tsx index f9bec61b..76c94c1c 100644 --- a/contrib/copilot-plugin/src/app/ChatHistory.tsx +++ b/contrib/copilot-plugin/src/app/ChatHistory.tsx @@ -300,13 +300,131 @@ const GroupedChatMessages: React.FC = () => { const messages = useChatStore((state) => state.chatMsgs); const scrollRef = useRef(null); - useEffect(() => { - if (scrollRef.current) { - scrollRef.current.scrollTop = scrollRef.current.scrollHeight; - } - }, [messages]); - + // compute grouped messages and helper values early so effects can reference them const groupedMessages = groupMessages(messages); + const lastText = groupedMessages.length + ? groupedMessages[groupedMessages.length - 1].messages.map((m) => m.message).join('\n') + : '' + const NEAR_BOTTOM_THRESHOLD = 120 + + // Reliable auto-scroll for both new messages and streaming updates. 
+ const prevCountRef = React.useRef(0); + const lastTextRef = React.useRef(''); + + // Scroll helper: find nearest scrollable element that actually overflows, otherwise fallback to window + const scrollToBottom = (startTarget?: HTMLElement | null) => { + const startEl = startTarget || scrollRef.current + if (!startEl) return + let cur: HTMLElement | null = startEl + while (cur && cur !== document.body) { + try { + const style = window.getComputedStyle(cur) + const overflowY = style.overflowY + if ((overflowY === 'auto' || overflowY === 'scroll' || overflowY === 'overlay') && cur.scrollHeight > cur.clientHeight) { + cur.scrollTop = cur.scrollHeight + console.debug('[ChatHistory] scrollToBottom scrolled element', cur.tagName, { newScrollTop: cur.scrollTop }) + return + } + } catch (e) { + // ignore + } + cur = cur.parentElement + } + + // fallback to window/document + try { + window.scrollTo(0, document.documentElement.scrollHeight) + console.debug('[ChatHistory] scrollToBottom scrolled window', document.documentElement.scrollHeight) + } catch (e) { + console.debug('[ChatHistory] scrollToBottom window scroll failed', e) + } + } + + useEffect(() => { + const el = scrollRef.current + if (!el) return + + // Debug: log scroll metrics on message updates + try { + const now = Date.now() + const scrollHeight = el.scrollHeight + const clientHeight = el.clientHeight + const scrollTop = el.scrollTop + const distanceFromBottom = scrollHeight - scrollTop - clientHeight + console.debug(`[ChatHistory] messages changed @${now} count=${groupedMessages.length}`, { + scrollHeight, + clientHeight, + scrollTop, + distanceFromBottom, + prevCount: prevCountRef.current, + lastText: lastTextRef.current?.slice(0, 200) || '(none)' + }) + } catch (err) { + console.debug('[ChatHistory] debug log failed', err) + } + + const distanceFromBottom = el.scrollHeight - el.scrollTop - el.clientHeight + const shouldScroll = + distanceFromBottom < NEAR_BOTTOM_THRESHOLD && + (prevCountRef.current != groupedMessages.length || lastTextRef.current != lastText) + + if (shouldScroll) { + console.debug('[ChatHistory] will auto-scroll (requestAnimationFrame)') + // try smooth scrolling in next frame + requestAnimationFrame(() => { + try { + scrollToBottom(el) + console.debug('[ChatHistory] did auto-scroll (requestAnimationFrame)') + } catch (e) { + console.debug('[ChatHistory] auto-scroll (RAF) failed', e) + } + }) + + // fallback: ensure scroll after a short delay + setTimeout(() => { + try { + scrollToBottom(el) + console.debug('[ChatHistory] did auto-scroll (timeout)') + } catch (e) { + console.debug('[ChatHistory] auto-scroll (timeout) failed', e) + } + }, 120) + } + + prevCountRef.current = groupedMessages.length + lastTextRef.current = lastText + }, [groupedMessages.length, lastText]) + + // observe DOM changes to catch streaming incremental updates + useEffect(() => { + const el = scrollRef.current + if (!el) return + + const observer = new MutationObserver((mutations) => { + try { + console.debug('[ChatHistory] MutationObserver triggered, mutations:', mutations.length) + const distanceFromBottom = el.scrollHeight - el.scrollTop - el.clientHeight + console.debug('[ChatHistory] MutationObserver metrics', { scrollTop: el.scrollTop, scrollHeight: el.scrollHeight, clientHeight: el.clientHeight, distanceFromBottom }) + if (distanceFromBottom < NEAR_BOTTOM_THRESHOLD) { + console.debug('[ChatHistory] MutationObserver will auto-scroll (RAF)') + requestAnimationFrame(() => { + try { + scrollToBottom(el) + console.debug('[ChatHistory] 
MutationObserver did auto-scroll') + } catch (e) { + console.debug('[ChatHistory] MutationObserver auto-scroll failed', e) + } + }) + } + } catch (e) { + console.debug('[ChatHistory] MutationObserver callback error', e) + } + }) + + observer.observe(el, { childList: true, subtree: true, characterData: true }) + return () => observer.disconnect() + }, [scrollRef]); + return (
diff --git a/contrib/copilot-plugin/src/components/pane.tsx b/contrib/copilot-plugin/src/components/pane.tsx index 34ca7e7c..53bc8dbc 100644 --- a/contrib/copilot-plugin/src/components/pane.tsx +++ b/contrib/copilot-plugin/src/components/pane.tsx @@ -1,19 +1,24 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT License. +import React from "react"; import { cn } from "../libs/utils"; interface PaneProps extends React.HTMLAttributes {} -export const Pane: React.FC = ({ children, className }) => { - return ( -
- {children} -
- ); -}; +export const Pane = React.forwardRef( + ({ children, className }, ref) => { + return ( +
+ {children} +
+ ); + } +); +Pane.displayName = "Pane"; From b68c194f781ec079fd46cd456b58908e46575765 Mon Sep 17 00:00:00 2001 From: Lei Date: Fri, 12 Sep 2025 20:37:46 +0800 Subject: [PATCH 05/34] adding more status update message --- src/copilot-chat/src/copilot_agent/copilot_turn.py | 3 ++- src/copilot-chat/src/copilot_agent/ltp/ltp.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/copilot-chat/src/copilot_agent/copilot_turn.py b/src/copilot-chat/src/copilot_agent/copilot_turn.py index bf7b23ad..018d3246 100644 --- a/src/copilot-chat/src/copilot_agent/copilot_turn.py +++ b/src/copilot-chat/src/copilot_agent/copilot_turn.py @@ -45,12 +45,13 @@ def process_turn(self, messages_list: list, skip_summary: bool = False, debuggin return {'category': None, 'answer': 'DEBUGGING MODE ENABLED', 'debug': {'debugging': debugging}} # get contextualized question from this and last user inquiry - push_frontend_event('🤔 Copilot is understanding your inquiry...
', replace=False) + push_frontend_event('🤔 Copilot is understanding your request...
', replace=False) this_inquiry = messages_list[-1]['content'] last_inquiry = messages_list[-3]['content'] if len(messages_list) > 2 else None question = contextualize_question(this_inquiry, last_inquiry) # classify the question to determine the solution source and method + push_frontend_event('🔍 Copilot is finding the right data source...
', replace=False) question_type = self.classifier.classify_question(question) # objective, concern in the question obj, con = question_type.get('lv0_object', '3. [general]'), question_type.get('lv1_concern', '0. [others]') diff --git a/src/copilot-chat/src/copilot_agent/ltp/ltp.py b/src/copilot-chat/src/copilot_agent/ltp/ltp.py index 38b806bf..c349a454 100644 --- a/src/copilot-chat/src/copilot_agent/ltp/ltp.py +++ b/src/copilot-chat/src/copilot_agent/ltp/ltp.py @@ -170,13 +170,14 @@ def query_powerbi(question: str, help_msg): """Query PowerBI data.""" # send HTML snippet so frontend (with rehype-raw enabled) can render with Tailwind styling - push_frontend_event('🔍 Copilot is gathering data to answer your inquiry...
', replace=False) + push_frontend_event('✍️ Copilot is crafting the query...
', replace=False) query_gen_res, query_gen_status = query_generation_kql(question) logger.info(f'KQL Query generation result: {query_gen_res}, status: {query_gen_status}') k_cluster = os.environ.get('DATA_SRC_KUSTO_CLUSTER_URL', '') k_db = os.environ.get('DATA_SRC_KUSTO_DATABASE_NAME', '') k_table = '' if query_gen_status == 0: + push_frontend_event('📥 Copilot is collecting the information...
', replace=False) KQL = KustoExecutor(k_cluster, k_db, k_table) # Replace placeholders query_gen_res = query_gen_res.format( From 1adf126e9f4f524c74b2b8800690c60e3266bb9e Mon Sep 17 00:00:00 2001 From: Lei Date: Wed, 17 Sep 2025 18:43:07 +0800 Subject: [PATCH 06/34] minor: add new status update message --- src/copilot-chat/src/copilot_agent/copilot_turn.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/copilot-chat/src/copilot_agent/copilot_turn.py b/src/copilot-chat/src/copilot_agent/copilot_turn.py index 018d3246..388221b4 100644 --- a/src/copilot-chat/src/copilot_agent/copilot_turn.py +++ b/src/copilot-chat/src/copilot_agent/copilot_turn.py @@ -100,6 +100,7 @@ def gen_smart_help_general(self, question: str) -> str: system_prompt = get_prompt_from(os.path.join(PROMPT_DIR, 'gen_smart_help_prompt_general.txt')) if isinstance(self.help_msg, dict) and 'feature' in self.help_msg: system_prompt = system_prompt + '\n\n' + self.help_msg['feature'] + push_frontend_event('🌐 Accessing public information...
', replace=False) summary = self.model.try_stream_fallback_chat(system_prompt, f'question is: {question}') return summary From 92d28690b556e2f8b00ca1cd2b6ea0659473cd38 Mon Sep 17 00:00:00 2001 From: Lei Date: Fri, 10 Oct 2025 16:31:26 -0700 Subject: [PATCH 07/34] Add feature: multi user concurrency --- .../src/copilot_agent/copilot_conversation.py | 11 +-- .../src/copilot_agent/copilot_service.py | 16 ++-- .../src/copilot_agent/copilot_turn.py | 49 +++++------ src/copilot-chat/src/copilot_agent/ltp/ltp.py | 19 +++-- .../src/copilot_agent/utils/authentication.py | 68 +++++++++------- .../src/copilot_agent/utils/llmsession.py | 81 ++++++++++++++----- .../src/copilot_agent/utils/push_frontend.py | 33 +++++++- .../src/copilot_agent/utils/smart_help.py | 7 +- .../src/copilot_agent/utils/summary.py | 7 +- 9 files changed, 189 insertions(+), 102 deletions(-) diff --git a/src/copilot-chat/src/copilot_agent/copilot_conversation.py b/src/copilot-chat/src/copilot_agent/copilot_conversation.py index 4cffbde7..6d6d68c2 100644 --- a/src/copilot-chat/src/copilot_agent/copilot_conversation.py +++ b/src/copilot-chat/src/copilot_agent/copilot_conversation.py @@ -17,6 +17,7 @@ from .config import AGENT_MODE_LOCAL, print_env_variables from .copilot_turn import CoPilotTurn +from .utils.llmsession import LLMSession HISTORY_DEPTH = int(os.getenv('COPILOT_HISTORY_DEPTH', 64)) if HISTORY_DEPTH <= 0: @@ -78,7 +79,7 @@ def _log_message_history(self) -> None: for user_id, messages in self.msg_dict.items(): logger.info(f'[internal control word] [msg_dict audit]: user "{user_id}" msg_list length is {len(messages)}') - def perform_operation(self, in_parameters: InParameters) -> OutParameters: + def perform_operation(self, in_parameters: InParameters, llm_session: LLMSession) -> OutParameters: """Main entry for performing an operation. 
Delegates to helpers for clarity.""" logger.info('[CoPilot]: New Chat Round Started') user_prompt = in_parameters.user @@ -108,10 +109,10 @@ def perform_operation(self, in_parameters: InParameters) -> OutParameters: result = self._handle_authenticate_failure(user_id, conv_id, turn_id) else: logger.info(f'User {username} authenticated successfully.') - result = self._handle_user_question(user_id, conv_id, turn_id, user_prompt, skip_summary, debugging, question_msg_info) + result = self._handle_user_question(user_id, conv_id, turn_id, user_prompt, skip_summary, debugging, question_msg_info, llm_session) else: logger.info(f'User {username} authenticated successfully.') - result = self._handle_user_question(user_id, conv_id, turn_id, user_prompt, skip_summary, debugging, question_msg_info) + result = self._handle_user_question(user_id, conv_id, turn_id, user_prompt, skip_summary, debugging, question_msg_info, llm_session) else: result = self._handle_empty_input() @@ -170,14 +171,14 @@ def _handle_authenticate_failure(self, user_id, conv_id, turn_id): out_parameters = OutParameters(resp) return out_parameters - def _handle_user_question(self, user_id, conv_id, turn_id, user_prompt, skip_summary, debugging, question_msg_info): + def _handle_user_question(self, user_id, conv_id, turn_id, user_prompt, skip_summary, debugging, question_msg_info, llm_session: LLMSession): """Handle the case where only a user question is provided.""" if user_id not in self.msg_dict: self.msg_dict[user_id] = deque(maxlen=HISTORY_DEPTH) msg_user = {'role': 'user', 'content': user_prompt} self.manage_conv_history(user_id, msg_user) logger.info(f'[internal control word] [per user check] user "{user_id}" msg_list length is {len(self.msg_dict[user_id])}') - resp = self.copilot.process_turn(self.msg_dict[user_id], skip_summary, debugging) + resp = self.copilot.process_turn(self.msg_dict[user_id], skip_summary, debugging, llm_session) if not isinstance(resp, dict): logger.info('Unexpected response format from copilot.process_turn') return self.handle_unexpected_copilot_response(user_id, conv_id, turn_id) diff --git a/src/copilot-chat/src/copilot_agent/copilot_service.py b/src/copilot-chat/src/copilot_agent/copilot_service.py index 9a33ae6a..be7bae13 100644 --- a/src/copilot-chat/src/copilot_agent/copilot_service.py +++ b/src/copilot-chat/src/copilot_agent/copilot_service.py @@ -85,20 +85,24 @@ def on_chunk(chunk: str): q.put(chunk) def worker(): - # set global callback on LLMSession so any call to model.stream_chat will invoke it - original_cb = getattr(LLMSession, '_global_stream_callback', None) - LLMSession.set_global_stream_callback(on_chunk) + # Create a dedicated LLM session for this request with per-instance callback + # This eliminates the global callback race condition that causes cross-user contamination try: in_parameters = self.copilot_conversation.build_in_parameters(data) - self.copilot_conversation.perform_operation(in_parameters) + # Create a fresh LLM session for this streaming request + llm_session = LLMSession() + llm_session.set_instance_stream_callback(on_chunk) + + # Pass the dedicated session to the conversation + result = self.copilot_conversation.perform_operation(in_parameters, llm_session=llm_session) except Exception as e: logger.error(f"Error during streaming operation worker: {e}") + import traceback + logger.error(traceback.format_exc()) q.put(json.dumps({'error': str(e)})) finally: # signal end of stream q.put(None) - # restore original callback - 
LLMSession.set_global_stream_callback(original_cb) def event_stream(): # start worker thread diff --git a/src/copilot-chat/src/copilot_agent/copilot_turn.py b/src/copilot-chat/src/copilot_agent/copilot_turn.py index 388221b4..eea8dffb 100644 --- a/src/copilot-chat/src/copilot_agent/copilot_turn.py +++ b/src/copilot-chat/src/copilot_agent/copilot_turn.py @@ -24,26 +24,27 @@ class CoPilotTurn: """CoPilot Turn, handles each inquiry/response turn.""" - def __init__(self, model: LLMSession = None, verbose: bool = False) -> None: + def __init__(self, verbose: bool = False) -> None: """Initialize.""" - if model is None: - model = LLMSession() - self.model = model self.verbose = verbose # Initialize version self._version = self._initialize_version() # Load help message self.help_msg = self.load_help_message() - # Question Classifier - self.classifier = QuestionClassifier(self._version, self.model) # entry function, processes the list of messages and returns a dictionary with the results - def process_turn(self, messages_list: list, skip_summary: bool = False, debugging: bool = False) -> dict: + def process_turn(self, messages_list: list, skip_summary: bool = False, debugging: bool = False, llm_session: LLMSession = None) -> dict: """Process the list of messages and return a dictionary with the results.""" if debugging: logger.info(f'DEBUGGING: {debugging}') return {'category': None, 'answer': 'DEBUGGING MODE ENABLED', 'debug': {'debugging': debugging}} + # Set thread-local session for push_frontend functions to use correct callback + from .utils.push_frontend import set_thread_llm_session + set_thread_llm_session(llm_session) + + classifier = QuestionClassifier(self._version, llm_session) + # get contextualized question from this and last user inquiry push_frontend_event('🤔 Copilot is understanding your request...
', replace=False) this_inquiry = messages_list[-1]['content'] @@ -52,42 +53,44 @@ def process_turn(self, messages_list: list, skip_summary: bool = False, debuggin # classify the question to determine the solution source and method push_frontend_event('🔍 Copilot is finding the right data source...
', replace=False) - question_type = self.classifier.classify_question(question) + question_type = classifier.classify_question(question) # objective, concern in the question obj, con = question_type.get('lv0_object', '3. [general]'), question_type.get('lv1_concern', '0. [others]') # verion f3, resolves objective 8 (Lucia Training Platform) if self._version == 'f3': if obj.count('8') > 0: - answer, debug = self.query_ltp(question, con, skip_summary) + answer, debug = self.query_ltp(question, con, skip_summary, llm_session) elif obj.count('3') > 0: - answer = self.gen_smart_help_general(question) + answer = self.gen_smart_help_general(question, llm_session) debug = {} elif obj.count('9') > 0: help_keys = ['feature'] - answer = gen_smart_help(self.help_msg, question, help_keys) + answer = gen_smart_help(self.help_msg, question, help_keys, llm_session=llm_session) debug = {} else: help_keys = ['unsupported_question'] - answer = gen_smart_help(self.help_msg, question, help_keys) + answer = gen_smart_help(self.help_msg, question, help_keys, llm_session=llm_session) debug = {} else: # Placeholder for other version implementations help_keys = ['unsupported_question'] - answer = gen_smart_help(self.help_msg, question, help_keys) + answer = gen_smart_help(self.help_msg, question, help_keys, llm_session=llm_session) debug = {} + return {'category': question_type, 'answer': answer, 'debug': debug} - def query_ltp(self, question: str, con: str, skip_summary: bool) -> tuple[str, dict]: + def query_ltp(self, question: str, con: str, skip_summary: bool, llm_session: LLMSession) -> tuple[str, dict]: """Query about Lucia Training Platform.""" - # Mapping concern codes to handler functions + # Mapping concern codes to handler functions + # Updated to pass llm_session to prevent singleton blocking handlers = { - '1': lambda: query_metrics(question, self.help_msg, skip_summary), - '2': lambda: query_metadata(question, self.help_msg, skip_summary), - '3': lambda: query_user_manual(question, self.help_msg), - '4': lambda: query_powerbi(question, self.help_msg), - '5': lambda: ltp_auto_reject(question, self.help_msg), - '6': lambda: ltp_human_intervention(question, self.help_msg), + '1': lambda: query_metrics(question, self.help_msg, skip_summary, llm_session), + '2': lambda: query_metadata(question, self.help_msg, skip_summary, llm_session), + '3': lambda: query_user_manual(question, self.help_msg, llm_session), + '4': lambda: query_powerbi(question, self.help_msg, llm_session), + '5': lambda: ltp_auto_reject(question, self.help_msg, llm_session), + '6': lambda: ltp_human_intervention(question, self.help_msg, llm_session), } for code, handler in handlers.items(): if con.count(code) > 0: @@ -95,13 +98,13 @@ def query_ltp(self, question: str, con: str, skip_summary: bool) -> tuple[str, d return 'unsupported concern.', {} # generate generic smart help message based on user input - def gen_smart_help_general(self, question: str) -> str: + def gen_smart_help_general(self, question: str, llm_session: LLMSession) -> str: """Generate smart help message based on user input.""" system_prompt = get_prompt_from(os.path.join(PROMPT_DIR, 'gen_smart_help_prompt_general.txt')) if isinstance(self.help_msg, dict) and 'feature' in self.help_msg: system_prompt = system_prompt + '\n\n' + self.help_msg['feature'] push_frontend_event('🌐 Accessing public information...
', replace=False) - summary = self.model.try_stream_fallback_chat(system_prompt, f'question is: {question}') + summary = llm_session.try_stream_fallback_chat(system_prompt, f'question is: {question}') return summary def get_preload_dashboard(self): diff --git a/src/copilot-chat/src/copilot_agent/ltp/ltp.py b/src/copilot-chat/src/copilot_agent/ltp/ltp.py index c349a454..27ee6473 100644 --- a/src/copilot-chat/src/copilot_agent/ltp/ltp.py +++ b/src/copilot-chat/src/copilot_agent/ltp/ltp.py @@ -31,7 +31,7 @@ SKIP_LUCIA_CONTROLLER_EXECUTION = True # session: query cluster or job metrics from Prometheus REST API -def query_metrics(question: str, help_msg, skip_summary: bool = False): +def query_metrics(question: str, help_msg, skip_summary: bool = False, llm_session=None): """Query cluster or job metrics from Prometheus backend.""" if SKIP_LUCIA_CONTROLLER_EXECUTION: @@ -69,6 +69,7 @@ def query_metrics(question: str, help_msg, skip_summary: bool = False): None, help_msg, skip_summary, + llm_session ) # generate additional info dict @@ -87,7 +88,7 @@ def query_metrics(question: str, help_msg, skip_summary: bool = False): # session: query job metadata from OpenPAI backend -def query_metadata(question: str, help_msg, skip_summary: bool = False): +def query_metadata(question: str, help_msg, skip_summary: bool = False, llm_session=None): """Query job metadata from OpenPAI backend.""" # generate query logger.info('Generating Query: LTP, Metadata') @@ -116,6 +117,7 @@ def query_metadata(question: str, help_msg, skip_summary: bool = False): None, help_msg, skip_summary, + llm_session ) # generate additional info dict @@ -126,7 +128,7 @@ def query_metadata(question: str, help_msg, skip_summary: bool = False): # session: query user manual from LTP documentation -def query_user_manual(question: str, help_msg): +def query_user_manual(question: str, help_msg, llm_session=None): """Query user manual.""" # read documentation documentation = get_prompt_from(os.path.join(PROMPT_DIR, SUB_FEATURE, 'ltp_documentation.txt')) @@ -134,7 +136,7 @@ def query_user_manual(question: str, help_msg): # generate answer logger.info('Generating Answer: LTP, User Manual') - summary = gen_summary(SUB_FEATURE, ltp_doc, None, question, 'gen_result_summary_doc.txt', None, help_msg) + summary = gen_summary(SUB_FEATURE, ltp_doc, None, question, 'gen_result_summary_doc.txt', None, help_msg, llm_session=llm_session) info_dict = {} return summary, info_dict @@ -166,7 +168,7 @@ def get_brief_job_metadata(resp): return job_metadatas -def query_powerbi(question: str, help_msg): +def query_powerbi(question: str, help_msg, llm_session=None): """Query PowerBI data.""" # send HTML snippet so frontend (with rehype-raw enabled) can render with Tailwind styling @@ -202,6 +204,7 @@ def query_powerbi(question: str, help_msg): None, help_msg, False, + llm_session ) if response_status == 0: @@ -215,7 +218,7 @@ def query_powerbi(question: str, help_msg): return summary, info_dict -def ltp_auto_reject(question: str, help_msg): +def ltp_auto_reject(question: str, help_msg, llm_session=None): """Auto rejected, unsupported by design.""" logger.info('Generating Answer: LTP, Auto Rejection') @@ -228,13 +231,14 @@ def ltp_auto_reject(question: str, help_msg): None, help_msg, False, + llm_session ) info_dict = {} return summary, info_dict -def ltp_human_intervention(question: str, help_msg): +def ltp_human_intervention(question: str, help_msg, llm_session=None): """Handle human intervention for LTP auto rejection.""" logger.info('Generating Answer: LTP, Human 
Intervention') @@ -247,6 +251,7 @@ def ltp_human_intervention(question: str, help_msg): None, help_msg, False, + llm_session ) info_dict = {} diff --git a/src/copilot-chat/src/copilot_agent/utils/authentication.py b/src/copilot-chat/src/copilot_agent/utils/authentication.py index ba1f4f8b..3d532a99 100644 --- a/src/copilot-chat/src/copilot_agent/utils/authentication.py +++ b/src/copilot-chat/src/copilot_agent/utils/authentication.py @@ -6,6 +6,7 @@ import os import requests import urllib.parse +import threading from datetime import datetime, timezone @@ -20,6 +21,8 @@ def __init__(self, expiration_ms: int = 3600000): self.restserver_url = os.getenv('RESTSERVER_URL', '') valid_vcs_env = os.getenv('COPILOT_VALID_VCS', 'admin,superuser') self.valid_vcs = [g.strip() for g in valid_vcs_env.split(',') if g.strip()] + # Add thread safety for authentication state + self._auth_lock = threading.Lock() def sanitize_username(self, username: str) -> str: """Sanitize the username by URL-encoding it to prevent path traversal or injection attacks.""" @@ -71,42 +74,45 @@ def set_authenticate_state(self, username: str, token: str) -> None: """Set the authentication state for a user, storing admin and virtualCluster info.""" expires_at = int(datetime.now(timezone.utc).timestamp() * 1000) + self.expiration_ms is_admin, virtual_cluster = self.authenticate(username, token) - if is_admin is not None and virtual_cluster is not None: - self.authenticate_state[username] = { - 'token': token, - 'expires_at': expires_at, - 'is_admin': is_admin, - 'virtual_cluster': virtual_cluster - } - else: - self.revoke(username) + with self._auth_lock: + if is_admin is not None and virtual_cluster is not None: + self.authenticate_state[username] = { + 'token': token, + 'expires_at': expires_at, + 'is_admin': is_admin, + 'virtual_cluster': virtual_cluster + } + else: + self.authenticate_state.pop(username, None) # Thread-safe removal def is_authenticated(self, username: str) -> bool: - state = self.authenticate_state.get(username) - now = int(datetime.now(timezone.utc).timestamp() * 1000) - if not state: - return False - if state['expires_at'] < now: - self.revoke(username) - return False - if "is_admin" not in state: - return False - if "virtual_cluster" not in state: - return False - if "is_admin" in state and "virtual_cluster" in state: - if state["is_admin"]: - # validate pass condition one: user is an admin - return True - elif not state["is_admin"] and self.get_membership(state["virtual_cluster"]): - # validate pass condition two: user is not an admin, but it belongs to a valid virtualCluster - return True - else: + with self._auth_lock: + state = self.authenticate_state.get(username) + now = int(datetime.now(timezone.utc).timestamp() * 1000) + if not state: + return False + if state['expires_at'] < now: + # Expired, remove from state + self.authenticate_state.pop(username, None) + return False + if "is_admin" not in state: return False - return False + if "virtual_cluster" not in state: + return False + if "is_admin" in state and "virtual_cluster" in state: + if state["is_admin"]: + # validate pass condition one: user is an admin + return True + elif not state["is_admin"] and self.get_membership(state["virtual_cluster"]): + # validate pass condition two: user is not an admin, but it belongs to a valid virtualCluster + return True + else: + return False + return False def get_membership(self, groups: list) -> bool: return any(group in self.valid_vcs for group in groups) def revoke(self, username: str): - if username in 
self.authenticate_state: - del self.authenticate_state[username] + with self._auth_lock: + self.authenticate_state.pop(username, None) diff --git a/src/copilot-chat/src/copilot_agent/utils/llmsession.py b/src/copilot-chat/src/copilot_agent/utils/llmsession.py index 38dac469..aac17e16 100644 --- a/src/copilot-chat/src/copilot_agent/utils/llmsession.py +++ b/src/copilot-chat/src/copilot_agent/utils/llmsession.py @@ -9,28 +9,41 @@ from ..utils.logger import logger class LLMSession: - """A class to interact with the Azure OpenAI model.""" - # Global stream callback that external code (server endpoint) can set - _global_stream_callback = None + """A thread-safe class for interacting with the Azure OpenAI model.""" + _global_stream_callback = None # Class-level attribute for backward compatibility + _config_cache = None # Cache configuration to avoid repeated env var reads + _config_lock = threading.Lock() # Lock for config cache def __init__(self): - # Env Var to set the LLM provider, accepted values are 'openai' or 'azure' - self.provider = os.environ.get("COPILOT_LLM_PROVIDER") - logger.info(f'COPILOT LLM Endpoint Provider: {self.provider}') - self.azure_api_key = os.environ.get("AZURE_OPENAI_API_KEY") - self.openai_api_key = os.environ.get("OPENAI_API_KEY") - self.endpoint = os.environ.get("COPILOT_LLM_ENDPOINT") - self.embedding_url = os.environ.get("COPILOT_EMBEDDING_URL") - self.model_name = os.environ.get("COPILOT_LLM_MODEL") - self.model_version = os.environ.get("COPILOT_LLM_VERSION") - self.embedding_model_name = os.environ.get("COPILOT_EMBEDDING_MODEL") + """Initialize a new LLMSession instance per request.""" + # Use cached config to avoid repeated environment variable reads + config = self._get_cached_config() + + self.provider = config['provider'] + self.azure_api_key = config['azure_api_key'] + self.openai_api_key = config['openai_api_key'] + self.endpoint = config['endpoint'] + self.model_name = config['model_name'] + self.model_version = config['model_version'] + self.embedding_model_name = config['embedding_model_name'] + + # Create a separate session for each instance to avoid blocking + self.session = requests.Session() + + # Per-instance stream callback to avoid cross-user contamination + self._instance_stream_callback = None + retries = Retry(total=5, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504]) + adapter = HTTPAdapter(max_retries=retries) + self.session.mount('https://', adapter) + self.session.headers.update({"Authorization": f"Bearer {self.openai_api_key or self.azure_api_key}"}) + if self.provider == "openai": self.model = openai.OpenAI( base_url=self.endpoint, api_key=self.openai_api_key ) self.embedding_model = openai.OpenAI( - base_url=self.embedding_url, + base_url=self.endpoint, api_key=self.openai_api_key ) elif self.provider == "azure": @@ -40,7 +53,7 @@ def __init__(self): api_version=self.model_version ) self.embedding_model = openai.AzureOpenAI( - azure_endpoint=self.embedding_url, + azure_endpoint=self.endpoint, api_key=self.azure_api_key, api_version=self.model_version ) @@ -48,6 +61,28 @@ def __init__(self): logger.error(f'Unsupported LLM provider: {self.provider}') raise ValueError(f'Unsupported LLM provider: {self.provider}') + @classmethod + def _get_cached_config(cls): + """Get cached configuration or read from environment variables.""" + if cls._config_cache is None: + with cls._config_lock: + if cls._config_cache is None: # Double-check pattern + cls._config_cache = { + 'provider': os.environ.get("COPILOT_LLM_PROVIDER"), + 
'azure_api_key': os.environ.get("AZURE_OPENAI_API_KEY"), + 'openai_api_key': os.environ.get("OPENAI_API_KEY"), + 'endpoint': os.environ.get("COPILOT_LLM_ENDPOINT"), + 'model_name': os.environ.get("COPILOT_LLM_MODEL"), + 'model_version': os.environ.get("COPILOT_LLM_VERSION"), + 'embedding_model_name': os.environ.get("COPILOT_EMBEDDING_MODEL") + } + logger.info(f'COPILOT LLM Endpoint Provider: {cls._config_cache["provider"]}') + return cls._config_cache + + def close_session(self): + """Close the persistent HTTP session.""" + self.session.close() + def chat(self, system_prompt, user_prompt): """Chat with the language model.""" msg = [ @@ -190,13 +225,13 @@ def stream_chat(self, system_prompt, user_prompt): if chunk: # accumulate to reconstruct full text and yield the full snapshot full += chunk - # call global callback if configured (external subscribers) + # call instance callback first (higher priority), then global callback try: - cb = LLMSession._global_stream_callback + cb = self._instance_stream_callback or LLMSession._global_stream_callback if cb: cb(full) except Exception: - logger.debug('Global stream callback failed') + logger.debug('Stream callback failed') yield full # If stream finishes without exception, stop generator normally @@ -223,13 +258,21 @@ def set_global_stream_callback(cls, cb): def clear_global_stream_callback(cls): cls._global_stream_callback = None + def set_instance_stream_callback(self, cb): + """Set per-instance stream callback to avoid cross-user contamination.""" + self._instance_stream_callback = cb + + def clear_instance_stream_callback(self): + """Clear per-instance stream callback.""" + self._instance_stream_callback = None + def try_stream_fallback_chat(self, system_prompt: str, user_prompt: str) -> str: """Try streaming the response (if a global stream callback is set) and fall back to the blocking chat call. Returns the final aggregated text. 
""" try: - if getattr(LLMSession, '_global_stream_callback', None): + if self._instance_stream_callback or getattr(LLMSession, '_global_stream_callback', None): logger.info('LLMSession: streaming via try_stream_fallback_chat') last = '' for snapshot in self.stream_chat(system_prompt, user_prompt): diff --git a/src/copilot-chat/src/copilot_agent/utils/push_frontend.py b/src/copilot-chat/src/copilot_agent/utils/push_frontend.py index 26598c2b..c2788a58 100644 --- a/src/copilot-chat/src/copilot_agent/utils/push_frontend.py +++ b/src/copilot-chat/src/copilot_agent/utils/push_frontend.py @@ -1,13 +1,33 @@ import json +import threading from ..utils.logger import logger from ..utils.llmsession import LLMSession +# Thread-local storage to track the current LLM session for this request +_thread_local = threading.local() + +def set_thread_llm_session(llm_session): + """Set the LLM session for the current thread (for streaming context).""" + _thread_local.llm_session = llm_session + +def get_thread_llm_session(): + """Get the LLM session for the current thread.""" + return getattr(_thread_local, 'llm_session', None) + def push_frontend_event(content: str, replace: bool = False): """Push an event to the frontend.""" - # If streaming is active, push only the appended content as a JSON "append" event + # Try to use thread-local LLM session first (for per-user streaming) + # Fall back to global callback for backward compatibility try: - cb = LLMSession._global_stream_callback + cb = None + thread_session = get_thread_llm_session() + if thread_session and hasattr(thread_session, '_instance_stream_callback'): + cb = thread_session._instance_stream_callback + + if not cb: + cb = LLMSession._global_stream_callback + if cb: if replace: cb(content) @@ -20,7 +40,14 @@ def push_frontend_event(content: str, replace: bool = False): def push_frontend_meta(message_info: dict): """Push a metadata event (messageInfo) to the frontend so client can attach turnId before answer arrives.""" try: - cb = LLMSession._global_stream_callback + cb = None + thread_session = get_thread_llm_session() + if thread_session and hasattr(thread_session, '_instance_stream_callback'): + cb = thread_session._instance_stream_callback + + if not cb: + cb = LLMSession._global_stream_callback + if cb: cb(json.dumps({"type": "meta", "messageInfo": message_info})) except Exception as e: diff --git a/src/copilot-chat/src/copilot_agent/utils/smart_help.py b/src/copilot-chat/src/copilot_agent/utils/smart_help.py index 3560c44b..e0679b1b 100644 --- a/src/copilot-chat/src/copilot_agent/utils/smart_help.py +++ b/src/copilot-chat/src/copilot_agent/utils/smart_help.py @@ -9,11 +9,10 @@ from ..utils.llmsession import LLMSession from ..utils.utils import get_prompt_from -model = LLMSession() # generate help message for CoPilot agent -def gen_smart_help(help_msg, user_question: str, key_lst: list, SMART_HELP=True) -> str: +def gen_smart_help(help_msg, user_question: str, key_lst: list, SMART_HELP=True, llm_session=None) -> str: """Generate smart help message for CoPilot agent.""" # dump help method dump_help = '' @@ -43,10 +42,10 @@ def gen_smart_help(help_msg, user_question: str, key_lst: list, SMART_HELP=True) question_prompt = f'[user question]\n {user_question} \n\n' user_prompt = question_prompt + f'[reason to generate the help]\n str{key_lst} \n\n' + capability_promp # send to a LLM session to generate a smart help - smart_help = model.try_stream_fallback_chat(sys_prompt, user_prompt) + smart_help = llm_session.try_stream_fallback_chat(sys_prompt, 
user_prompt) final_help = smart_help else: dump_help_prompt = f'[reason to generate the help]\n {dump_help} \n\n' - final_help = model.try_stream_fallback_chat(sys_prompt, dump_help_prompt) + final_help = llm_session.try_stream_fallback_chat(sys_prompt, dump_help_prompt) return final_help diff --git a/src/copilot-chat/src/copilot_agent/utils/summary.py b/src/copilot-chat/src/copilot_agent/utils/summary.py index 886dd94e..3a000581 100644 --- a/src/copilot-chat/src/copilot_agent/utils/summary.py +++ b/src/copilot-chat/src/copilot_agent/utils/summary.py @@ -14,10 +14,9 @@ from ..utils.smart_help import gen_smart_help from ..utils.utils import get_prompt_from -model = LLMSession() def gen_summary( - SUB_FEATURE, resp_total, resp_brief, question, gen_prompt_file, knowledge_prompt_file, help_msg, skip_summary=False + SUB_FEATURE, resp_total, resp_brief, question, gen_prompt_file, knowledge_prompt_file, help_msg, skip_summary=False, llm_session=None ): """Generate a summary.""" logger.info('Generating Response Report') @@ -45,7 +44,7 @@ def gen_summary( if not skip_summary: logger.info('Bypass summary: False') # try stream chat, if fail, fall back to chat - summary = model.try_stream_fallback_chat(sys_prompt, user_prompt) + summary = llm_session.try_stream_fallback_chat(sys_prompt, user_prompt) else: logger.info('Bypass summary: True') summary = handle_bypass_summary(resp_total, resp_brief) @@ -53,7 +52,7 @@ def gen_summary( logger.info('generating smart help') help_keys = ['corrupted_data'] summary = help_keys[0] - summary = gen_smart_help(help_msg, question, help_keys) + summary = gen_smart_help(help_msg, question, help_keys, llm_session) return summary From a0c50a0370866167cf909b329d5ea806ccdd6c58 Mon Sep 17 00:00:00 2001 From: Lei Date: Fri, 10 Oct 2025 17:15:04 -0700 Subject: [PATCH 08/34] fix: minor bug for non-streaming api --- src/copilot-chat/src/copilot_agent/copilot_service.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/copilot-chat/src/copilot_agent/copilot_service.py b/src/copilot-chat/src/copilot_agent/copilot_service.py index be7bae13..54ed154e 100644 --- a/src/copilot-chat/src/copilot_agent/copilot_service.py +++ b/src/copilot-chat/src/copilot_agent/copilot_service.py @@ -51,10 +51,11 @@ def status(self): def instance_operation(self): """POST endpoint to handle copilot operations.""" logger.info("Received request at /copilot/api/operation") + llm_session = LLMSession() try: data = request.get_json() in_parameters = self.copilot_conversation.build_in_parameters(data) - out_parameters = self.copilot_conversation.perform_operation(in_parameters) + out_parameters = self.copilot_conversation.perform_operation(in_parameters, llm_session=llm_session) response = { "status": "success", "data": out_parameters.__dict__ From f4c3f1b01ef6eb6d38fb70d1bbeba219e359460f Mon Sep 17 00:00:00 2001 From: Lei Date: Fri, 10 Oct 2025 17:19:54 -0700 Subject: [PATCH 09/34] improve user experience: post unauthorized access information to unauthorized users --- contrib/copilot-plugin/package.json | 1 + .../src/copilot_agent/copilot_conversation.py | 12 ++++++++++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/contrib/copilot-plugin/package.json b/contrib/copilot-plugin/package.json index 7de6c3b3..5888a3ce 100644 --- a/contrib/copilot-plugin/package.json +++ b/contrib/copilot-plugin/package.json @@ -78,6 +78,7 @@ "start": "node scripts/start.js", "start:user1": "REACT_APP_USER=dev.eva PORT=3000 node scripts/start.js", "start:user2": "REACT_APP_USER=dev.ben 
PORT=3001 node scripts/start.js", + "start:user3": "REACT_APP_USER=dev.unknown PORT=3002 node scripts/start.js", "build": "node scripts/build.js", "test": "node scripts/test.js", "clean": "rimraf build" diff --git a/src/copilot-chat/src/copilot_agent/copilot_conversation.py b/src/copilot-chat/src/copilot_agent/copilot_conversation.py index 6d6d68c2..e292a1b7 100644 --- a/src/copilot-chat/src/copilot_agent/copilot_conversation.py +++ b/src/copilot-chat/src/copilot_agent/copilot_conversation.py @@ -106,7 +106,7 @@ def perform_operation(self, in_parameters: InParameters, llm_session: LLMSession self.auth_manager.set_authenticate_state(username, rest_token) if not self.auth_manager.is_authenticated(username): logger.error(f'User {username} failed authentication twice. Aborting operation.') - result = self._handle_authenticate_failure(user_id, conv_id, turn_id) + result = self._handle_authenticate_failure(user_id, conv_id, turn_id, llm_session) else: logger.info(f'User {username} authenticated successfully.') result = self._handle_user_question(user_id, conv_id, turn_id, user_prompt, skip_summary, debugging, question_msg_info, llm_session) @@ -164,11 +164,19 @@ def _handle_empty_input(self): out_parameters = OutParameters(resp) return out_parameters - def _handle_authenticate_failure(self, user_id, conv_id, turn_id): + def _handle_authenticate_failure(self, user_id, conv_id, turn_id, llm_session: LLMSession = None): """Handle authentication failure case.""" logger.info('User authentication failed. Aborting operation.') resp = self._make_skip_response(user_id, conv_id, turn_id, 'error') out_parameters = OutParameters(resp) + + # If LLM session is available, set up thread-local context and use push_frontend_event + if llm_session: + from .utils.push_frontend import set_thread_llm_session, push_frontend_event + set_thread_llm_session(llm_session) + error_message = 'Unauthorized - Authentication failed' + push_frontend_event(error_message) + return out_parameters def _handle_user_question(self, user_id, conv_id, turn_id, user_prompt, skip_summary, debugging, question_msg_info, llm_session: LLMSession): From 7776476f420a712cbbcd3c1a3bf01af0f434a68e Mon Sep 17 00:00:00 2001 From: Lei Date: Tue, 14 Oct 2025 16:08:32 -0700 Subject: [PATCH 10/34] code cleanup: remove unnecessary llmsession instances --- .../src/copilot_agent/copilot_turn.py | 2 +- src/copilot-chat/src/copilot_agent/ltp/ltp.py | 4 +--- .../src/copilot_agent/ltp/ltp_dashboard.py | 2 -- .../utils/conversation_manager.py | 6 ++--- .../src/copilot_agent/utils/dcw.py | 9 +++----- .../src/copilot_agent/utils/query.py | 22 +++++++++---------- 6 files changed, 17 insertions(+), 28 deletions(-) diff --git a/src/copilot-chat/src/copilot_agent/copilot_turn.py b/src/copilot-chat/src/copilot_agent/copilot_turn.py index eea8dffb..1d0e1cd3 100644 --- a/src/copilot-chat/src/copilot_agent/copilot_turn.py +++ b/src/copilot-chat/src/copilot_agent/copilot_turn.py @@ -49,7 +49,7 @@ def process_turn(self, messages_list: list, skip_summary: bool = False, debuggin push_frontend_event('🤔 Copilot is understanding your request...
', replace=False) this_inquiry = messages_list[-1]['content'] last_inquiry = messages_list[-3]['content'] if len(messages_list) > 2 else None - question = contextualize_question(this_inquiry, last_inquiry) + question = contextualize_question(this_inquiry, last_inquiry, llm_session) # classify the question to determine the solution source and method push_frontend_event('🔍 Copilot is finding the right data source...
', replace=False) diff --git a/src/copilot-chat/src/copilot_agent/ltp/ltp.py b/src/copilot-chat/src/copilot_agent/ltp/ltp.py index 27ee6473..bb29418d 100644 --- a/src/copilot-chat/src/copilot_agent/ltp/ltp.py +++ b/src/copilot-chat/src/copilot_agent/ltp/ltp.py @@ -24,8 +24,6 @@ from ..utils.utils import get_prompt_from from .ltp_dashboard import query_generation_kql -model = LLMSession() - SUB_FEATURE = 'ltp' SKIP_LUCIA_CONTROLLER_EXECUTION = True @@ -44,7 +42,7 @@ def query_metrics(question: str, help_msg, skip_summary: bool = False, llm_sessi else: # generate query logger.info('Generating Query: LTP, Metrics') - query, end_time_stamp, parallel, param = gen_promql_query(SUB_FEATURE, question) + query, end_time_stamp, parallel, param = gen_promql_query(SUB_FEATURE, question, llm_session) if not query: logger.info(f'No query found in the response, query is {query}') diff --git a/src/copilot-chat/src/copilot_agent/ltp/ltp_dashboard.py b/src/copilot-chat/src/copilot_agent/ltp/ltp_dashboard.py index ff545518..dc92f626 100644 --- a/src/copilot-chat/src/copilot_agent/ltp/ltp_dashboard.py +++ b/src/copilot-chat/src/copilot_agent/ltp/ltp_dashboard.py @@ -15,8 +15,6 @@ from ..config import DATA_DIR -model = LLMSession() - # Document preparation utilities class DocPrepare: diff --git a/src/copilot-chat/src/copilot_agent/utils/conversation_manager.py b/src/copilot-chat/src/copilot_agent/utils/conversation_manager.py index fbd36cc1..0f6a275e 100644 --- a/src/copilot-chat/src/copilot_agent/utils/conversation_manager.py +++ b/src/copilot-chat/src/copilot_agent/utils/conversation_manager.py @@ -15,9 +15,7 @@ from ..utils.llmsession import LLMSession from ..utils.utils import get_prompt_from, extract_json_dict -model = LLMSession() - -def contextualize_question(question: str, last_question: str | None) -> str: +def contextualize_question(question: str, last_question: str | None, llm_session: LLMSession) -> str: """Contextualizes the current question based on the last question.""" logger.info(f"Contextualizing question: '{question}' based on last question: '{last_question}'") if last_question is None: @@ -28,7 +26,7 @@ def contextualize_question(question: str, last_question: str | None) -> str: 'this_question': question, 'last_question': last_question, }) - new_question_str = model.chat(contextualize_prompt, user_prompt) + new_question_str = llm_session.chat(contextualize_prompt, user_prompt) new_question_dict = extract_json_dict(new_question_str, nested=False) if isinstance(new_question_dict, dict): new_question = new_question_dict.get('new_question', question) diff --git a/src/copilot-chat/src/copilot_agent/utils/dcw.py b/src/copilot-chat/src/copilot_agent/utils/dcw.py index 1f6f685f..111727b5 100644 --- a/src/copilot-chat/src/copilot_agent/utils/dcw.py +++ b/src/copilot-chat/src/copilot_agent/utils/dcw.py @@ -16,14 +16,11 @@ from ..utils.types import DCW, Customer, Design from ..utils.utils import get_prompt_from -model = LLMSession() - - # generate dcw (design, criteria, workload) from using question -def gen_dcw(user_prompt: str, map_existing: bool) -> DCW: +def gen_dcw(user_prompt: str, map_existing: bool, llm_session: LLMSession) -> DCW: """Generate a DCW object from the user's question.""" sys_prompt = get_prompt_from(os.path.join(PROMPT_DIR, 'gen_dcw_prompt.txt')) - resp = model.chat(sys_prompt, user_prompt) + resp = llm_session.chat(sys_prompt, user_prompt) logger.info(f'gen_dcw, user_prompt, is {user_prompt}') logger.info(f'gen_dcw, resp, is {resp}') if 'target' not in resp.lower() and 
'baseline' not in resp.lower(): @@ -41,7 +38,7 @@ def gen_dcw(user_prompt: str, map_existing: bool) -> DCW: if all(item not in dcw.Workload.lower() for item in revise_json['list']): logger.info(f'before revise: {dcw}') - dcw.Workload = model.chat(revise_sys_prompt, revise_user_prompt) + dcw.Workload = llm_session.chat(revise_sys_prompt, revise_user_prompt) logger.info(f'after revise: {dcw}') return dcw diff --git a/src/copilot-chat/src/copilot_agent/utils/query.py b/src/copilot-chat/src/copilot_agent/utils/query.py index 44cc25e2..db1ba6fa 100644 --- a/src/copilot-chat/src/copilot_agent/utils/query.py +++ b/src/copilot-chat/src/copilot_agent/utils/query.py @@ -17,35 +17,33 @@ from ..utils.time import get_current_unix_timestamp from ..utils.utils import extract_json_dict, get_prompt_from -model = LLMSession() - -def gen_kusto_query_pseudo(SUB_FEATURE: str, gen_prompt_file: str, user_prompt: str) -> dict: +def gen_kusto_query_pseudo(SUB_FEATURE: str, gen_prompt_file: str, user_prompt: str, llm_session: LLMSession) -> dict: """Generate controller input.""" sys_prompt = get_prompt_from(os.path.join(PROMPT_DIR, SUB_FEATURE, gen_prompt_file)) - resp = model.chat(sys_prompt, user_prompt) + resp = llm_session.chat(sys_prompt, user_prompt) controller_input = extract_json_dict(benchmark=resp, nested=False) return controller_input -def gen_kusto_query_fallback_pseudo(question, SUB_FEATURE, gen_fallback_prompt) -> str: +def gen_kusto_query_fallback_pseudo(question, SUB_FEATURE, gen_fallback_prompt, llm_session: LLMSession) -> str: """Generate fallback query from RCA to status.""" logger.info('Generate a fall back status query') system_prompt = get_prompt_from(os.path.join(PROMPT_DIR, SUB_FEATURE, gen_fallback_prompt)) user_prompt = f' is\n{question}' - resp = model.chat(system_prompt, user_prompt) + resp = llm_session.chat(system_prompt, user_prompt) fall_back_question = resp.replace('```', '') return fall_back_question -def gen_sql_query(SUB_FEATURE: str, database: SQLManager, question: str) -> str: +def gen_sql_query(SUB_FEATURE: str, database: SQLManager, question: str, llm_session: LLMSession) -> str: """Generate a general SQL query.""" logger.info('Generate a SQL query') generation_prompt = get_prompt_from(os.path.join(PROMPT_DIR, SUB_FEATURE, 'gen_query_sql_general.txt')) accepeted_value_prompt = database.get_unique_values() system_prompt = generation_prompt + f' are\n{accepeted_value_prompt}' user_prompt = f' is\n{question}' - resp = model.chat(system_prompt, user_prompt) + resp = llm_session.chat(system_prompt, user_prompt) logger.info(f'resp {resp}') matches = re.findall(r'[`\']{3}(.*?)[`\']{3}', resp, re.DOTALL) if not matches: @@ -136,7 +134,7 @@ def _get_promql_param(): return param -def _gen_promql_query_param(SUB_FEATURE: str, question: str) -> dict: +def _gen_promql_query_param(SUB_FEATURE: str, question: str, llm_session: LLMSession) -> dict: """Generate a general PromQL query.""" logger.info('Generate a PromQL query') generation_prompt = get_prompt_from(os.path.join(PROMPT_DIR, SUB_FEATURE, 'gen_query_promql_metrics.txt')) @@ -144,16 +142,16 @@ def _gen_promql_query_param(SUB_FEATURE: str, question: str) -> dict: # logger.info(f'accepeted_value_prompt:\n{accepeted_value_prompt}') system_prompt = generation_prompt + f' are\n{accepeted_value_prompt}' user_prompt = f' is\n{question}' - resp = model.chat(system_prompt, user_prompt) + resp = llm_session.chat(system_prompt, user_prompt) logger.info(f'resp:\n{resp}') params = extract_json_dict(benchmark=resp, nested=False) 
logger.info(f'params:\n{params}') return params -def gen_promql_query(SUB_FEATURE: str, question: str) -> tuple[str, bool]: +def gen_promql_query(SUB_FEATURE: str, question: str, llm_session: LLMSession) -> tuple[str, bool]: """Generate a general PromQL query.""" - params = _gen_promql_query_param(SUB_FEATURE, question) + params = _gen_promql_query_param(SUB_FEATURE, question, llm_session) if not isinstance(params, dict): logger.info(f'No query found in the response, params is {params}') return None, None, None, None From 1e1285c87fe249779160d66484d64ab8eb2b4f16 Mon Sep 17 00:00:00 2001 From: Lei Date: Tue, 14 Oct 2025 16:48:29 -0700 Subject: [PATCH 11/34] code refactor: stage 0 --- src/copilot-chat/src/copilot_agent/__main__.py | 4 +--- src/copilot-chat/src/copilot_agent/copilot_conversation.py | 4 ++-- src/copilot-chat/src/copilot_agent/copilot_service.py | 4 ++-- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/src/copilot-chat/src/copilot_agent/__main__.py b/src/copilot-chat/src/copilot_agent/__main__.py index eed047a3..e3927657 100644 --- a/src/copilot-chat/src/copilot_agent/__main__.py +++ b/src/copilot-chat/src/copilot_agent/__main__.py @@ -4,13 +4,11 @@ """Main module.""" from .copilot_service import CoPilotService -from .copilot_conversation import CoPilotConversation def main(): """Main function.""" - copilot_conversation = CoPilotConversation() - api = CoPilotService(copilot_conversation) + api = CoPilotService() api.run() diff --git a/src/copilot-chat/src/copilot_agent/copilot_conversation.py b/src/copilot-chat/src/copilot_agent/copilot_conversation.py index e292a1b7..178f024a 100644 --- a/src/copilot-chat/src/copilot_agent/copilot_conversation.py +++ b/src/copilot-chat/src/copilot_agent/copilot_conversation.py @@ -64,7 +64,7 @@ class CoPilotConversation: def __init__(self): """Initialize CoPilotConversation, message history, and authentication manager.""" print_env_variables() - self.copilot = CoPilotTurn(verbose=False) + self.copilot_turn = CoPilotTurn(verbose=False) self.msg_dict = {} # Dictionary to store message deques per user self.auth_manager = AuthenticationManager() @@ -186,7 +186,7 @@ def _handle_user_question(self, user_id, conv_id, turn_id, user_prompt, skip_sum msg_user = {'role': 'user', 'content': user_prompt} self.manage_conv_history(user_id, msg_user) logger.info(f'[internal control word] [per user check] user "{user_id}" msg_list length is {len(self.msg_dict[user_id])}') - resp = self.copilot.process_turn(self.msg_dict[user_id], skip_summary, debugging, llm_session) + resp = self.copilot_turn.process_turn(self.msg_dict[user_id], skip_summary, debugging, llm_session) if not isinstance(resp, dict): logger.info('Unexpected response format from copilot.process_turn') return self.handle_unexpected_copilot_response(user_id, conv_id, turn_id) diff --git a/src/copilot-chat/src/copilot_agent/copilot_service.py b/src/copilot-chat/src/copilot_agent/copilot_service.py index 54ed154e..5fbac385 100644 --- a/src/copilot-chat/src/copilot_agent/copilot_service.py +++ b/src/copilot-chat/src/copilot_agent/copilot_service.py @@ -22,14 +22,14 @@ # --- New CoPilotAPI class (Flask app setup and endpoints) --- class CoPilotService: """Flask app and endpoint manager for CoPilot.""" - def __init__(self, copilot_conversation: CoPilotConversation): + def __init__(self): """ Initialize the CoPilotAPI with a CoPilot instance, set up Flask app and endpoints. Args: copilot: Instance of CoPilot business logic class. 
""" - self.copilot_conversation = copilot_conversation + self.copilot_conversation = CoPilotConversation() self.host = os.getenv('AGENT_HOST', '127.0.0.1') self.app = Flask(__name__) self.app.add_url_rule('/copilot/api/status', view_func=self.status, methods=['GET']) From 99c280adb2ef13635470f61c105c216acdc2cca1 Mon Sep 17 00:00:00 2001 From: Lei Date: Tue, 14 Oct 2025 17:39:58 -0700 Subject: [PATCH 12/34] code refactor: stage 1 --- src/copilot-chat/src/copilot_agent/config.py | 3 +-- src/copilot-chat/src/copilot_agent/copilot_conversation.py | 7 ++++--- src/copilot-chat/src/copilot_agent/ltp/ltp_dashboard.py | 4 +++- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/src/copilot-chat/src/copilot_agent/config.py b/src/copilot-chat/src/copilot_agent/config.py index 823f2bb6..661260e7 100644 --- a/src/copilot-chat/src/copilot_agent/config.py +++ b/src/copilot-chat/src/copilot_agent/config.py @@ -16,7 +16,7 @@ # set agent cross talk AGENT_PORT = int(os.getenv('AGENT_PORT', '50000')) AGENT_MODE_LOCAL = os.getenv('AGENT_MODE', '').lower() == 'local' -AGENT_MODE_CA_LOCAL = os.getenv('AGENT_MODE_CA', '').lower() == 'local' +AGENT_MINIMAL_ON = os.getenv('AGENT_MINIMAL', '').lower() == 'on' # TO_CONTROLLER = True # @@ -31,4 +31,3 @@ def print_env_variables(): logger.info(f"Env Var: COPILOT_VERSION: {os.getenv('COPILOT_VERSION', 'na')}") logger.info(f"Env Var: AGENT_PORT: {os.getenv('AGENT_PORT', '50000')}") logger.info(f"Env Var: AGENT_MODE: {os.getenv('AGENT_MODE', 'na')}") - logger.info(f"Env Var: AGENT_MODE_CA: {os.getenv('AGENT_MODE_CA', 'na')}") diff --git a/src/copilot-chat/src/copilot_agent/copilot_conversation.py b/src/copilot-chat/src/copilot_agent/copilot_conversation.py index 178f024a..26e8cff1 100644 --- a/src/copilot-chat/src/copilot_agent/copilot_conversation.py +++ b/src/copilot-chat/src/copilot_agent/copilot_conversation.py @@ -15,7 +15,7 @@ from .utils.kql_executor import KustoExecutor from .utils.push_frontend import push_frontend_event, push_frontend_meta -from .config import AGENT_MODE_LOCAL, print_env_variables +from .config import AGENT_MINIMAL_ON, print_env_variables from .copilot_turn import CoPilotTurn from .utils.llmsession import LLMSession @@ -263,8 +263,9 @@ def _log_message_data(self, inout: str, parameters: Union[InParameters, OutParam 'Debug': debug, } logger.info(f'[copilot data collection] {log_data}') - # ingest kusto table - self.collect_data_to_kusto(log_data) + if not AGENT_MINIMAL_ON: + # ingest kusto table + self.collect_data_to_kusto(log_data) def handle_unexpected_copilot_response(self, user_id: str, conv_id: str, turn_id: str) -> OutParameters: """Handle unexpected response format from copilot agent and log error.""" diff --git a/src/copilot-chat/src/copilot_agent/ltp/ltp_dashboard.py b/src/copilot-chat/src/copilot_agent/ltp/ltp_dashboard.py index dc92f626..11d473f6 100644 --- a/src/copilot-chat/src/copilot_agent/ltp/ltp_dashboard.py +++ b/src/copilot-chat/src/copilot_agent/ltp/ltp_dashboard.py @@ -13,7 +13,7 @@ from ..utils.rag import QueryGeneratorRAG from ..utils.logger import logger -from ..config import DATA_DIR +from ..config import DATA_DIR, AGENT_MINIMAL_ON # Document preparation utilities @@ -112,6 +112,8 @@ def _init_instance(cls): kql_sample_docs_data = DocPrepare.get_txt_as_list_hashtag(KUSTO_SAMPLE_DATA_FILE) kql_knowledge_docs = DocPrepare.get_txt_as_list(KUSTO_KNOWLEDGE_FILE) kql_schema_docs = kql_table_docs + kql_operator_docs + kql_sample_docs + kql_sample_docs_data + kql_knowledge_docs + if AGENT_MINIMAL_ON: + kql_schema_docs = 
[] logger.info(f'length of kql_schema_docs: {len(kql_schema_docs)}') for doc in kql_schema_docs: logger.info(doc[:50]) From 7f989e75626b07789fd6cdab9e6bd98ff77c9ea6 Mon Sep 17 00:00:00 2001 From: Lei Date: Wed, 15 Oct 2025 10:16:37 -0700 Subject: [PATCH 13/34] code refactor: stage 2 --- .../src/copilot_agent/copilot_service.py | 32 ++++++++++++++----- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/src/copilot-chat/src/copilot_agent/copilot_service.py b/src/copilot-chat/src/copilot_agent/copilot_service.py index 5fbac385..227aeb35 100644 --- a/src/copilot-chat/src/copilot_agent/copilot_service.py +++ b/src/copilot-chat/src/copilot_agent/copilot_service.py @@ -29,7 +29,7 @@ def __init__(self): Args: copilot: Instance of CoPilot business logic class. """ - self.copilot_conversation = CoPilotConversation() + self.sessions = {} self.host = os.getenv('AGENT_HOST', '127.0.0.1') self.app = Flask(__name__) self.app.add_url_rule('/copilot/api/status', view_func=self.status, methods=['GET']) @@ -48,14 +48,25 @@ def status(self): """GET endpoint for health/status check.""" return jsonify({"status": "running"}) + def get_or_create_session(self, user_id, conv_id): + """Retrieve or create a copilot_conversation for the given userId and convId.""" + session_key = f"{user_id}_{conv_id}" + if session_key not in self.sessions: + self.sessions[session_key] = CoPilotConversation() + return self.sessions[session_key] + def instance_operation(self): """POST endpoint to handle copilot operations.""" logger.info("Received request at /copilot/api/operation") - llm_session = LLMSession() try: data = request.get_json() - in_parameters = self.copilot_conversation.build_in_parameters(data) - out_parameters = self.copilot_conversation.perform_operation(in_parameters, llm_session=llm_session) + user_id = data['data']['messageInfo']['userId'] + conv_id = data['data']['messageInfo']['convId'] + copilot_conversation = self.get_or_create_session(user_id, conv_id) + + llm_session = LLMSession() + in_parameters = copilot_conversation.build_in_parameters(data) + out_parameters = copilot_conversation.perform_operation(in_parameters, llm_session=llm_session) response = { "status": "success", "data": out_parameters.__dict__ @@ -75,6 +86,12 @@ def stream_operation(self): logger.info("Received request at /copilot/api/operation/stream") try: data = request.get_json() + user_id = data['data']['messageInfo']['userId'] + conv_id = data['data']['messageInfo']['convId'] + copilot_conversation = self.get_or_create_session(user_id, conv_id) + except KeyError as e: + logger.error(f"Missing key in JSON body for stream_operation: {e}") + return jsonify({"status": "error", "message": f"Missing key: {e}"}), 400 except Exception as e: logger.error(f"Failed to parse JSON body for stream_operation: {e}") return jsonify({"status": "error", "message": "invalid json"}), 400 @@ -87,15 +104,14 @@ def on_chunk(chunk: str): def worker(): # Create a dedicated LLM session for this request with per-instance callback - # This eliminates the global callback race condition that causes cross-user contamination try: - in_parameters = self.copilot_conversation.build_in_parameters(data) + in_parameters = copilot_conversation.build_in_parameters(data) # Create a fresh LLM session for this streaming request llm_session = LLMSession() llm_session.set_instance_stream_callback(on_chunk) - + # Pass the dedicated session to the conversation - result = self.copilot_conversation.perform_operation(in_parameters, llm_session=llm_session) + result = 
copilot_conversation.perform_operation(in_parameters, llm_session=llm_session) except Exception as e: logger.error(f"Error during streaming operation worker: {e}") import traceback From 856f861070697a7f6475fe20b8171c2c82836fc8 Mon Sep 17 00:00:00 2001 From: Lei Date: Thu, 16 Oct 2025 15:55:18 -0700 Subject: [PATCH 14/34] code refactor: stage 3, use the same llmsession for each conversation, pre-load prompts for classifier, contextualizer --- .../src/copilot_agent/copilot_conversation.py | 23 ++++----- .../src/copilot_agent/copilot_service.py | 42 +++++++++------- .../src/copilot_agent/copilot_turn.py | 49 ++++++++++--------- .../src/copilot_agent/utils/__init__.py | 4 +- .../src/copilot_agent/utils/classify.py | 14 ++++-- .../utils/conversation_manager.py | 37 ++++++++------ 6 files changed, 95 insertions(+), 74 deletions(-) diff --git a/src/copilot-chat/src/copilot_agent/copilot_conversation.py b/src/copilot-chat/src/copilot_agent/copilot_conversation.py index 26e8cff1..77d86767 100644 --- a/src/copilot-chat/src/copilot_agent/copilot_conversation.py +++ b/src/copilot-chat/src/copilot_agent/copilot_conversation.py @@ -61,12 +61,13 @@ def __init__(self, response: dict) -> None: # --- New CoPilot class (business logic only) --- class CoPilotConversation: """CoPilot Conversation, manages the inquiry/response turns for each user.""" - def __init__(self): + def __init__(self, llm_session: LLMSession) -> None: """Initialize CoPilotConversation, message history, and authentication manager.""" print_env_variables() - self.copilot_turn = CoPilotTurn(verbose=False) + self.copilot_turn = CoPilotTurn(llm_session=llm_session, verbose=False) self.msg_dict = {} # Dictionary to store message deques per user self.auth_manager = AuthenticationManager() + self.llm_session = llm_session def manage_conv_history(self, user_id: str, message: dict) -> None: """Append a message to the user's message history.""" @@ -79,7 +80,7 @@ def _log_message_history(self) -> None: for user_id, messages in self.msg_dict.items(): logger.info(f'[internal control word] [msg_dict audit]: user "{user_id}" msg_list length is {len(messages)}') - def perform_operation(self, in_parameters: InParameters, llm_session: LLMSession) -> OutParameters: + def perform_operation(self, in_parameters: InParameters) -> OutParameters: """Main entry for performing an operation. Delegates to helpers for clarity.""" logger.info('[CoPilot]: New Chat Round Started') user_prompt = in_parameters.user @@ -106,13 +107,13 @@ def perform_operation(self, in_parameters: InParameters, llm_session: LLMSession self.auth_manager.set_authenticate_state(username, rest_token) if not self.auth_manager.is_authenticated(username): logger.error(f'User {username} failed authentication twice. 
Aborting operation.') - result = self._handle_authenticate_failure(user_id, conv_id, turn_id, llm_session) + result = self._handle_authenticate_failure(user_id, conv_id, turn_id) else: logger.info(f'User {username} authenticated successfully.') - result = self._handle_user_question(user_id, conv_id, turn_id, user_prompt, skip_summary, debugging, question_msg_info, llm_session) + result = self._handle_user_question(user_id, conv_id, turn_id, user_prompt, skip_summary, debugging, question_msg_info) else: logger.info(f'User {username} authenticated successfully.') - result = self._handle_user_question(user_id, conv_id, turn_id, user_prompt, skip_summary, debugging, question_msg_info, llm_session) + result = self._handle_user_question(user_id, conv_id, turn_id, user_prompt, skip_summary, debugging, question_msg_info) else: result = self._handle_empty_input() @@ -164,29 +165,29 @@ def _handle_empty_input(self): out_parameters = OutParameters(resp) return out_parameters - def _handle_authenticate_failure(self, user_id, conv_id, turn_id, llm_session: LLMSession = None): + def _handle_authenticate_failure(self, user_id, conv_id, turn_id): """Handle authentication failure case.""" logger.info('User authentication failed. Aborting operation.') resp = self._make_skip_response(user_id, conv_id, turn_id, 'error') out_parameters = OutParameters(resp) # If LLM session is available, set up thread-local context and use push_frontend_event - if llm_session: + if self.llm_session: from .utils.push_frontend import set_thread_llm_session, push_frontend_event - set_thread_llm_session(llm_session) + set_thread_llm_session(self.llm_session) error_message = 'Unauthorized - Authentication failed' push_frontend_event(error_message) return out_parameters - def _handle_user_question(self, user_id, conv_id, turn_id, user_prompt, skip_summary, debugging, question_msg_info, llm_session: LLMSession): + def _handle_user_question(self, user_id, conv_id, turn_id, user_prompt, skip_summary, debugging, question_msg_info): """Handle the case where only a user question is provided.""" if user_id not in self.msg_dict: self.msg_dict[user_id] = deque(maxlen=HISTORY_DEPTH) msg_user = {'role': 'user', 'content': user_prompt} self.manage_conv_history(user_id, msg_user) logger.info(f'[internal control word] [per user check] user "{user_id}" msg_list length is {len(self.msg_dict[user_id])}') - resp = self.copilot_turn.process_turn(self.msg_dict[user_id], skip_summary, debugging, llm_session) + resp = self.copilot_turn.process_turn(self.msg_dict[user_id], skip_summary, debugging) if not isinstance(resp, dict): logger.info('Unexpected response format from copilot.process_turn') return self.handle_unexpected_copilot_response(user_id, conv_id, turn_id) diff --git a/src/copilot-chat/src/copilot_agent/copilot_service.py b/src/copilot-chat/src/copilot_agent/copilot_service.py index 227aeb35..3a6049e3 100644 --- a/src/copilot-chat/src/copilot_agent/copilot_service.py +++ b/src/copilot-chat/src/copilot_agent/copilot_service.py @@ -48,11 +48,11 @@ def status(self): """GET endpoint for health/status check.""" return jsonify({"status": "running"}) - def get_or_create_session(self, user_id, conv_id): + def get_or_create_session(self, user_id, conv_id, llm_session): """Retrieve or create a copilot_conversation for the given userId and convId.""" session_key = f"{user_id}_{conv_id}" if session_key not in self.sessions: - self.sessions[session_key] = CoPilotConversation() + self.sessions[session_key] = CoPilotConversation(llm_session) return 
self.sessions[session_key] def instance_operation(self): @@ -62,11 +62,12 @@ def instance_operation(self): data = request.get_json() user_id = data['data']['messageInfo']['userId'] conv_id = data['data']['messageInfo']['convId'] - copilot_conversation = self.get_or_create_session(user_id, conv_id) - llm_session = LLMSession() + copilot_conversation = self.get_or_create_session(user_id, conv_id, llm_session) + + in_parameters = copilot_conversation.build_in_parameters(data) - out_parameters = copilot_conversation.perform_operation(in_parameters, llm_session=llm_session) + out_parameters = copilot_conversation.perform_operation(in_parameters) response = { "status": "success", "data": out_parameters.__dict__ @@ -84,11 +85,26 @@ def stream_operation(self): forwarded to the HTTP response. This avoids changing many internal call chains. """ logger.info("Received request at /copilot/api/operation/stream") + + # Create queue BEFORE the callback function + q = queue.Queue() + + def on_chunk(chunk: str): + # put chunk into queue for streaming response + q.put(chunk) + try: data = request.get_json() user_id = data['data']['messageInfo']['userId'] conv_id = data['data']['messageInfo']['convId'] - copilot_conversation = self.get_or_create_session(user_id, conv_id) + llm_session = LLMSession() # Create a new LLM session + llm_session.set_instance_stream_callback(on_chunk) + copilot_conversation = self.get_or_create_session(user_id, conv_id, llm_session) # Pass llm_session + + # CRITICAL: Update the llm_session in the conversation for subsequent requests + # The copilot_turn needs to use the NEW llm_session with the NEW callback + copilot_conversation.llm_session = llm_session + copilot_conversation.copilot_turn.llm_session = llm_session except KeyError as e: logger.error(f"Missing key in JSON body for stream_operation: {e}") return jsonify({"status": "error", "message": f"Missing key: {e}"}), 400 @@ -96,22 +112,14 @@ def stream_operation(self): logger.error(f"Failed to parse JSON body for stream_operation: {e}") return jsonify({"status": "error", "message": "invalid json"}), 400 - q = queue.Queue() - def on_chunk(chunk: str): - # put chunk into queue for streaming response - q.put(chunk) def worker(): - # Create a dedicated LLM session for this request with per-instance callback + # Use the llm_session from the copilot_conversation try: in_parameters = copilot_conversation.build_in_parameters(data) - # Create a fresh LLM session for this streaming request - llm_session = LLMSession() - llm_session.set_instance_stream_callback(on_chunk) - - # Pass the dedicated session to the conversation - result = copilot_conversation.perform_operation(in_parameters, llm_session=llm_session) + # Reuse the llm_session passed to the conversation + copilot_conversation.perform_operation(in_parameters) except Exception as e: logger.error(f"Error during streaming operation worker: {e}") import traceback diff --git a/src/copilot-chat/src/copilot_agent/copilot_turn.py b/src/copilot-chat/src/copilot_agent/copilot_turn.py index 1d0e1cd3..ebc39b9f 100644 --- a/src/copilot-chat/src/copilot_agent/copilot_turn.py +++ b/src/copilot-chat/src/copilot_agent/copilot_turn.py @@ -11,10 +11,10 @@ from .config import COPILOT_VERSION, DATA_DIR, PROMPT_DIR from .ltp import ltp_auto_reject, ltp_human_intervention, query_metadata, query_metrics, query_user_manual, query_powerbi from .utils import ( + Contextualizer, LLMSession, LTPReportProcessor, QuestionClassifier, - contextualize_question, gen_smart_help, get_prompt_from, 
push_frontend_event, @@ -24,16 +24,20 @@ class CoPilotTurn: """CoPilot Turn, handles each inquiry/response turn.""" - def __init__(self, verbose: bool = False) -> None: + def __init__(self, llm_session: LLMSession, verbose: bool = False) -> None: """Initialize.""" self.verbose = verbose + self.llm_session = llm_session # Initialize version self._version = self._initialize_version() # Load help message self.help_msg = self.load_help_message() + self.system_prompt_answer_general = get_prompt_from(os.path.join(PROMPT_DIR, 'gen_smart_help_prompt_general.txt')) + self.classifier = QuestionClassifier(self._version, self.llm_session) + self.contextualizer = Contextualizer(self.llm_session) # entry function, processes the list of messages and returns a dictionary with the results - def process_turn(self, messages_list: list, skip_summary: bool = False, debugging: bool = False, llm_session: LLMSession = None) -> dict: + def process_turn(self, messages_list: list, skip_summary: bool = False, debugging: bool = False) -> dict: """Process the list of messages and return a dictionary with the results.""" if debugging: logger.info(f'DEBUGGING: {debugging}') @@ -41,56 +45,55 @@ def process_turn(self, messages_list: list, skip_summary: bool = False, debuggin # Set thread-local session for push_frontend functions to use correct callback from .utils.push_frontend import set_thread_llm_session - set_thread_llm_session(llm_session) - - classifier = QuestionClassifier(self._version, llm_session) + set_thread_llm_session(self.llm_session) # get contextualized question from this and last user inquiry push_frontend_event('🤔 Copilot is understanding your request...
', replace=False) this_inquiry = messages_list[-1]['content'] last_inquiry = messages_list[-3]['content'] if len(messages_list) > 2 else None - question = contextualize_question(this_inquiry, last_inquiry, llm_session) + question = self.contextualizer.contextualize(this_inquiry, last_inquiry) # classify the question to determine the solution source and method push_frontend_event('🔍 Copilot is finding the right the data source...
', replace=False) - question_type = classifier.classify_question(question) + question_type = self.classifier.classify_question(question) # objective, concern in the question obj, con = question_type.get('lv0_object', '3. [general]'), question_type.get('lv1_concern', '0. [others]') # verion f3, resolves objective 8 (Lucia Training Platform) if self._version == 'f3': if obj.count('8') > 0: - answer, debug = self.query_ltp(question, con, skip_summary, llm_session) + answer, debug = self.query_ltp(question, con, skip_summary) elif obj.count('3') > 0: - answer = self.gen_smart_help_general(question, llm_session) + answer = self.gen_answer_general(question) debug = {} elif obj.count('9') > 0: help_keys = ['feature'] - answer = gen_smart_help(self.help_msg, question, help_keys, llm_session=llm_session) + answer = gen_smart_help(self.help_msg, question, help_keys, True, self.llm_session) + debug = {} else: help_keys = ['unsupported_question'] - answer = gen_smart_help(self.help_msg, question, help_keys, llm_session=llm_session) + answer = gen_smart_help(self.help_msg, question, help_keys, True, self.llm_session) debug = {} else: # Placeholder for other version implementations help_keys = ['unsupported_question'] - answer = gen_smart_help(self.help_msg, question, help_keys, llm_session=llm_session) + answer = gen_smart_help(self.help_msg, question, help_keys, True, self.llm_session) debug = {} return {'category': question_type, 'answer': answer, 'debug': debug} - def query_ltp(self, question: str, con: str, skip_summary: bool, llm_session: LLMSession) -> tuple[str, dict]: + def query_ltp(self, question: str, con: str, skip_summary: bool) -> tuple[str, dict]: """Query about Lucia Training Platform.""" # Mapping concern codes to handler functions # Updated to pass llm_session to prevent singleton blocking handlers = { - '1': lambda: query_metrics(question, self.help_msg, skip_summary, llm_session), - '2': lambda: query_metadata(question, self.help_msg, skip_summary, llm_session), - '3': lambda: query_user_manual(question, self.help_msg, llm_session), - '4': lambda: query_powerbi(question, self.help_msg, llm_session), - '5': lambda: ltp_auto_reject(question, self.help_msg, llm_session), - '6': lambda: ltp_human_intervention(question, self.help_msg, llm_session), + '1': lambda: query_metrics(question, self.help_msg, skip_summary, self.llm_session), + '2': lambda: query_metadata(question, self.help_msg, skip_summary, self.llm_session), + '3': lambda: query_user_manual(question, self.help_msg, self.llm_session), + '4': lambda: query_powerbi(question, self.help_msg, self.llm_session), + '5': lambda: ltp_auto_reject(question, self.help_msg, self.llm_session), + '6': lambda: ltp_human_intervention(question, self.help_msg, self.llm_session), } for code, handler in handlers.items(): if con.count(code) > 0: @@ -98,13 +101,13 @@ def query_ltp(self, question: str, con: str, skip_summary: bool, llm_session: LL return 'unsupported concern.', {} # generate generic smart help message based on user input - def gen_smart_help_general(self, question: str, llm_session: LLMSession) -> str: + def gen_answer_general(self, question: str) -> str: """Generate smart help message based on user input.""" - system_prompt = get_prompt_from(os.path.join(PROMPT_DIR, 'gen_smart_help_prompt_general.txt')) + system_prompt = self.system_prompt_answer_general if isinstance(self.help_msg, dict) and 'feature' in self.help_msg: system_prompt = system_prompt + '\n\n' + self.help_msg['feature'] push_frontend_event('🌐 Accessing public 
information...
', replace=False) - summary = llm_session.try_stream_fallback_chat(system_prompt, f'question is: {question}') + summary = self.llm_session.try_stream_fallback_chat(system_prompt, f'question is: {question}') return summary def get_preload_dashboard(self): diff --git a/src/copilot-chat/src/copilot_agent/utils/__init__.py b/src/copilot-chat/src/copilot_agent/utils/__init__.py index c2bb0d88..313af6dc 100644 --- a/src/copilot-chat/src/copilot_agent/utils/__init__.py +++ b/src/copilot-chat/src/copilot_agent/utils/__init__.py @@ -5,7 +5,7 @@ from .authentication import AuthenticationManager from .classify import QuestionClassifier -from .conversation_manager import contextualize_question +from .conversation_manager import Contextualizer from .dcw import dcw_parser, extract_dcw_from_history, gen_dcw, parse_and_align_dcw from .kql_executor import KustoExecutor from .logger import ( @@ -66,6 +66,7 @@ __all__ = [ 'DCW', 'AuthenticationManager', + 'Contextualizer', 'Customer', 'Design', 'KustoExecutor', @@ -76,7 +77,6 @@ 'QueryGeneratorRAG', 'SQLManager', 'RestAPIClient', - 'contextualize_question', 'dcw_parser', 'execute_openpai_query', 'execute_promql_query', diff --git a/src/copilot-chat/src/copilot_agent/utils/classify.py b/src/copilot-chat/src/copilot_agent/utils/classify.py index d52c7a1b..d14686f8 100644 --- a/src/copilot-chat/src/copilot_agent/utils/classify.py +++ b/src/copilot-chat/src/copilot_agent/utils/classify.py @@ -14,6 +14,12 @@ class QuestionClassifier: def __init__(self, version, model): self.version = version self.model = model + if self.version == 'f3': + self.lv0_system_prompt = get_prompt_from(os.path.join(PROMPT_DIR, 'classification/f3/lv0.txt')) + self.lv1_system_prompt = get_prompt_from(os.path.join(PROMPT_DIR, 'classification/f3/lv1.txt')) + else: + self.lv0_system_prompt = None + self.lv1_system_prompt = None def classify_question(self, question: str) -> dict: """Classify the question and return a dictionary with the results.""" @@ -31,17 +37,15 @@ def classify_question(self, question: str) -> dict: def classifier_lv0(self, question: str) -> str: """Classify the user question into several categories.""" if self.version == 'f3': - sys_prompt = get_prompt_from(os.path.join(PROMPT_DIR, 'classification/f3/lv0.txt')) + resp = self.model.chat(self.lv0_system_prompt, question) else: - raise ValueError(f'Unsupported version: {self.version}') - resp = self.model.chat(sys_prompt, question) + resp = '3' # default to 3 return resp def classifier_lv1(self, question: str) -> str: """Classify the user question into several categories.""" if self.version == 'f3': - sys_prompt = get_prompt_from(os.path.join(PROMPT_DIR, 'classification/f3/lv1.txt')) - resp = self.model.chat(sys_prompt, question) + resp = self.model.chat(self.lv1_system_prompt, question) else: resp = '0' # default to 0 return resp diff --git a/src/copilot-chat/src/copilot_agent/utils/conversation_manager.py b/src/copilot-chat/src/copilot_agent/utils/conversation_manager.py index 0f6a275e..e48289b5 100644 --- a/src/copilot-chat/src/copilot_agent/utils/conversation_manager.py +++ b/src/copilot-chat/src/copilot_agent/utils/conversation_manager.py @@ -15,20 +15,25 @@ from ..utils.llmsession import LLMSession from ..utils.utils import get_prompt_from, extract_json_dict -def contextualize_question(question: str, last_question: str | None, llm_session: LLMSession) -> str: +class Contextualizer: """Contextualizes the current question based on the last question.""" - logger.info(f"Contextualizing question: '{question}' based on last 
question: '{last_question}'") - if last_question is None: - return question - else: - contextualize_prompt = get_prompt_from(os.path.join(PROMPT_DIR, 'dialouge_state_tracking', 'dst.txt')) - user_prompt = str({ - 'this_question': question, - 'last_question': last_question, - }) - new_question_str = llm_session.chat(contextualize_prompt, user_prompt) - new_question_dict = extract_json_dict(new_question_str, nested=False) - if isinstance(new_question_dict, dict): - new_question = new_question_dict.get('new_question', question) - logger.info(f"Return: '{new_question}'") - return new_question \ No newline at end of file + def __init__(self, llm_session: LLMSession): + self.llm_session = llm_session + self.contextualize_prompt = get_prompt_from(os.path.join(PROMPT_DIR, 'dialouge_state_tracking', 'dst.txt')) + + def contextualize(self, question: str, last_question: str | None) -> str: + """Contextualizes the current question based on the last question.""" + logger.info(f"Contextualizing question: '{question}' based on last question: '{last_question}'") + if last_question is None: + return question + else: + user_prompt = str({ + 'this_question': question, + 'last_question': last_question, + }) + new_question_str = self.llm_session.chat(self.contextualize_prompt, user_prompt) + new_question_dict = extract_json_dict(new_question_str, nested=False) + if isinstance(new_question_dict, dict): + new_question = new_question_dict.get('new_question', question) + logger.info(f"Return: '{new_question}'") + return new_question From 9f8a8b0877fc638f2a2f299ab64710f5161aa94b Mon Sep 17 00:00:00 2001 From: Lei Date: Thu, 16 Oct 2025 17:09:54 -0700 Subject: [PATCH 15/34] code refactor: stage 4, change ltp into a class for easier state management --- .../src/copilot_agent/copilot_turn.py | 20 +- .../src/copilot_agent/ltp/__init__.py | 9 +- src/copilot-chat/src/copilot_agent/ltp/ltp.py | 491 +++++++++--------- .../src/copilot_agent/utils/__init__.py | 2 + 4 files changed, 265 insertions(+), 257 deletions(-) diff --git a/src/copilot-chat/src/copilot_agent/copilot_turn.py b/src/copilot-chat/src/copilot_agent/copilot_turn.py index ebc39b9f..b77184c5 100644 --- a/src/copilot-chat/src/copilot_agent/copilot_turn.py +++ b/src/copilot-chat/src/copilot_agent/copilot_turn.py @@ -9,7 +9,7 @@ from .utils.logger import logger from .config import COPILOT_VERSION, DATA_DIR, PROMPT_DIR -from .ltp import ltp_auto_reject, ltp_human_intervention, query_metadata, query_metrics, query_user_manual, query_powerbi +from .ltp import LTP from .utils import ( Contextualizer, LLMSession, @@ -18,6 +18,7 @@ gen_smart_help, get_prompt_from, push_frontend_event, + set_thread_llm_session ) @@ -35,6 +36,7 @@ def __init__(self, llm_session: LLMSession, verbose: bool = False) -> None: self.system_prompt_answer_general = get_prompt_from(os.path.join(PROMPT_DIR, 'gen_smart_help_prompt_general.txt')) self.classifier = QuestionClassifier(self._version, self.llm_session) self.contextualizer = Contextualizer(self.llm_session) + self.processor = LTP(self.llm_session, False) # entry function, processes the list of messages and returns a dictionary with the results def process_turn(self, messages_list: list, skip_summary: bool = False, debugging: bool = False) -> dict: @@ -44,7 +46,6 @@ def process_turn(self, messages_list: list, skip_summary: bool = False, debuggin return {'category': None, 'answer': 'DEBUGGING MODE ENABLED', 'debug': {'debugging': debugging}} # Set thread-local session for push_frontend functions to use correct callback - from 
.utils.push_frontend import set_thread_llm_session set_thread_llm_session(self.llm_session) # get contextualized question from this and last user inquiry @@ -52,10 +53,12 @@ def process_turn(self, messages_list: list, skip_summary: bool = False, debuggin this_inquiry = messages_list[-1]['content'] last_inquiry = messages_list[-3]['content'] if len(messages_list) > 2 else None question = self.contextualizer.contextualize(this_inquiry, last_inquiry) + #question = this_inquiry # classify the question to determine the solution source and method push_frontend_event('🔍 Copilot is finding the right the data source...
', replace=False) question_type = self.classifier.classify_question(question) + #question_type = {'lv0_object': '3', 'lv1_concern': '0'} # objective, concern in the question obj, con = question_type.get('lv0_object', '3. [general]'), question_type.get('lv1_concern', '0. [others]') @@ -85,15 +88,16 @@ def process_turn(self, messages_list: list, skip_summary: bool = False, debuggin def query_ltp(self, question: str, con: str, skip_summary: bool) -> tuple[str, dict]: """Query about Lucia Training Platform.""" + self.processor.llm_session = self.llm_session # ensure processor uses the current llm_session # Mapping concern codes to handler functions # Updated to pass llm_session to prevent singleton blocking handlers = { - '1': lambda: query_metrics(question, self.help_msg, skip_summary, self.llm_session), - '2': lambda: query_metadata(question, self.help_msg, skip_summary, self.llm_session), - '3': lambda: query_user_manual(question, self.help_msg, self.llm_session), - '4': lambda: query_powerbi(question, self.help_msg, self.llm_session), - '5': lambda: ltp_auto_reject(question, self.help_msg, self.llm_session), - '6': lambda: ltp_human_intervention(question, self.help_msg, self.llm_session), + '1': lambda: self.processor.query_metrics(question, self.help_msg, skip_summary), + '2': lambda: self.processor.query_metadata(question, self.help_msg, skip_summary), + '3': lambda: self.processor.query_user_manual(question, self.help_msg), + '4': lambda: self.processor.query_powerbi(question, self.help_msg), + '5': lambda: self.processor.auto_reject(question, self.help_msg), + '6': lambda: self.processor.human_intervention(question, self.help_msg), } for code, handler in handlers.items(): if con.count(code) > 0: diff --git a/src/copilot-chat/src/copilot_agent/ltp/__init__.py b/src/copilot-chat/src/copilot_agent/ltp/__init__.py index 848910cc..52d55d16 100644 --- a/src/copilot-chat/src/copilot_agent/ltp/__init__.py +++ b/src/copilot-chat/src/copilot_agent/ltp/__init__.py @@ -3,13 +3,8 @@ """Init file for LTP module.""" -from .ltp import ltp_auto_reject, ltp_human_intervention, query_metadata, query_metrics, query_user_manual, query_powerbi +from .ltp import LTP __all__ = [ - 'ltp_auto_reject', - 'ltp_human_intervention', - 'query_metadata', - 'query_metrics', - 'query_user_manual', - 'query_powerbi', + 'LTP' ] diff --git a/src/copilot-chat/src/copilot_agent/ltp/ltp.py b/src/copilot-chat/src/copilot_agent/ltp/ltp.py index bb29418d..dd4e2e86 100644 --- a/src/copilot-chat/src/copilot_agent/ltp/ltp.py +++ b/src/copilot-chat/src/copilot_agent/ltp/ltp.py @@ -24,248 +24,255 @@ from ..utils.utils import get_prompt_from from .ltp_dashboard import query_generation_kql -SUB_FEATURE = 'ltp' - -SKIP_LUCIA_CONTROLLER_EXECUTION = True - -# session: query cluster or job metrics from Prometheus REST API -def query_metrics(question: str, help_msg, skip_summary: bool = False, llm_session=None): - """Query cluster or job metrics from Prometheus backend.""" - - if SKIP_LUCIA_CONTROLLER_EXECUTION: - logger.info('Skipping PromQL query generation.') - query, end_time_stamp, parallel, param = None, None, None, {'scrape_interval': None, 'time_offset': None} - logger.info('Skipping PromQL query execution.') - resp = f'Skipping PromQL query execution due to lack of support, this will be enabled in the next release.' 
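# --- Illustrative aside (hypothetical sketch, not part of this patch) --------
# The query_ltp refactor above keeps its concern-code dispatch as a dict of
# code -> handler lambdas matched with con.count(code). A minimal standalone
# sketch of that pattern, with made-up handler names, could look like this:

def _dispatch_example(con: str) -> str:
    handlers = {
        '1': lambda: 'metrics',
        '2': lambda: 'metadata',
        '3': lambda: 'user manual',
    }
    # first concern code found in the classifier label wins
    for code, handler in handlers.items():
        if con.count(code) > 0:
            return handler()
    return 'unsupported concern.'

assert _dispatch_example('2. [metadata]') == 'metadata'
# -----------------------------------------------------------------------------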
- resp_parser_param = resp - value = {'result': resp} - else: + +class LTP: + """LTP Query Engine for handling various LTP-related queries.""" + + SUB_FEATURE = 'ltp' + SKIP_LUCIA_CONTROLLER_EXECUTION = True + + def __init__(self, llm_session: LLMSession = None, skip_execution: bool = True) -> None: + """ + Initialize LTP Query Engine. + + Args: + llm_session: LLM session for generating queries and summaries + skip_execution: Whether to skip actual query execution (for testing/development) + """ + self.llm_session = llm_session + self.skip_execution = skip_execution + + def query_metrics(self, question: str, help_msg, skip_summary: bool = False): + """Query cluster or job metrics from Prometheus backend.""" + + if self.skip_execution: + logger.info('Skipping PromQL query generation.') + query, end_time_stamp, parallel, param = None, None, None, {'scrape_interval': None, 'time_offset': None} + logger.info('Skipping PromQL query execution.') + resp = f'Skipping PromQL query execution due to lack of support, this will be enabled in the next release.' + resp_parser_param = resp + value = {'result': resp} + else: + # generate query + logger.info('Generating Query: LTP, Metrics') + query, end_time_stamp, parallel, param = gen_promql_query(self.SUB_FEATURE, question, self.llm_session) + + if not query: + logger.info(f'No query found in the response, query is {query}') + answer = 'Query generation failed, no query found.' + return answer, None + # execute + logger.info('Executing Query: LTP, Metrics') + if not parallel: + resp = execute_promql_query(query, {}) + else: + resp = execute_promql_query_step(query, {}, end_time_stamp, f"{param['time_offset']}") + resp_parser_param, value = retrive_promql_response_value(resp, param) + + # generate answer + logger.info('Generating Answer: LTP, Metrics') + summary = gen_summary( + self.SUB_FEATURE, + {'cluster_job_metrics': value}, + {'cluster_job_metrics': value}, + question, + 'gen_result_summary_metrics.txt', + None, + help_msg, + skip_summary, + self.llm_session + ) + + # generate additional info dict + info_dict = {} + info_dict['query_promql'] = query + info_dict['query_param_promql'] = param + info_dict['resp_promql'] = resp + info_dict['value_promql'] = value + info_dict['__debug:f1_query'] = query + info_dict['__debug:f2_execution_method'] = parallel + info_dict['__debug:f3_scrape_interval_in_execution'] = param.get('scrape_interval', None) + info_dict['__debug:f4_resp_parser_param'] = resp_parser_param + info_dict['__debug:r0_type_resp_promql'] = type(resp).__name__ + + return summary, info_dict + + def query_metadata(self, question: str, help_msg, skip_summary: bool = False): + """Query job metadata from OpenPAI backend.""" # generate query - logger.info('Generating Query: LTP, Metrics') - query, end_time_stamp, parallel, param = gen_promql_query(SUB_FEATURE, question, llm_session) - - if not query: - logger.info(f'No query found in the response, query is {query}') - answer = 'Query generation failed, no query found.' - return answer, None - # execute - logger.info('Executing Query: LTP, Metrics') - if not parallel: - resp = execute_promql_query(query, {}) + logger.info('Generating Query: LTP, Metadata') + query = 'restserver/jobs?offset=0&limit=49999&withTotalCount=true&order=completionTime' + + if self.skip_execution: + logger.info('Skipping job metadata query execution.') + resp = f'Skipping job metadata query execution due to lack of support, this will be enabled in the next release.' 
+ job_metadata = {'result': resp} + job_metadata_brief = {'result': resp} else: - resp = execute_promql_query_step(query, {}, end_time_stamp, f"{param['time_offset']}") - resp_parser_param, value = retrive_promql_response_value(resp, param) - - # generate answer - logger.info('Generating Answer: LTP, Metrics') - summary = gen_summary( - SUB_FEATURE, - {'cluster_job_metrics': value}, - {'cluster_job_metrics': value}, - question, - 'gen_result_summary_metrics.txt', - None, - help_msg, - skip_summary, - llm_session - ) - - # generate additional info dict - info_dict = {} - info_dict['query_promql'] = query - info_dict['query_param_promql'] = param - info_dict['resp_promql'] = resp - info_dict['value_promql'] = value - info_dict['__debug:f1_query'] = query - info_dict['__debug:f2_execution_method'] = parallel - info_dict['__debug:f3_scrape_interval_in_execution'] = param.get('scrape_interval', None) - info_dict['__debug:f4_resp_parser_param'] = resp_parser_param - info_dict['__debug:r0_type_resp_promql'] = type(resp).__name__ - - return summary, info_dict - - -# session: query job metadata from OpenPAI backend -def query_metadata(question: str, help_msg, skip_summary: bool = False, llm_session=None): - """Query job metadata from OpenPAI backend.""" - # generate query - logger.info('Generating Query: LTP, Metadata') - query = 'restserver/jobs?offset=0&limit=49999&withTotalCount=true&order=completionTime' - - if SKIP_LUCIA_CONTROLLER_EXECUTION: - logger.info('Skipping job metadata query execution.') - resp = f'Skipping job metadata query execution due to lack of support, this will be enabled in the next release.' - job_metadata = {'result': resp} - job_metadata_brief = {'result': resp} - else: - # extract - logger.info('Executing Query: LTP, Metadata') - resp = execute_openpai_query(query, {}) - job_metadata = extract_job_metadata(resp) - job_metadata_brief = get_brief_job_metadata(resp) - - # generate answer - logger.info('Generating Answer: LTP, Metadata') - summary = gen_summary( - SUB_FEATURE, - {'job_metadata': job_metadata}, - {'job_metadata_only_the_first_1000_jobs': job_metadata_brief}, - question, - 'gen_result_summary_metadata.txt', - None, - help_msg, - skip_summary, - llm_session - ) - - # generate additional info dict - info_dict = {} - info_dict['query_promql'] = query - info_dict['resp_brief_promql'] = job_metadata_brief - return summary, info_dict - - -# session: query user manual from LTP documentation -def query_user_manual(question: str, help_msg, llm_session=None): - """Query user manual.""" - # read documentation - documentation = get_prompt_from(os.path.join(PROMPT_DIR, SUB_FEATURE, 'ltp_documentation.txt')) - ltp_doc = {'lucia training platform documentation': documentation} - - # generate answer - logger.info('Generating Answer: LTP, User Manual') - summary = gen_summary(SUB_FEATURE, ltp_doc, None, question, 'gen_result_summary_doc.txt', None, help_msg, llm_session=llm_session) - - info_dict = {} - return summary, info_dict - - -def extract_job_metadata(resp): - """Extract job metadata from OpenPAI response.""" - if isinstance(resp, dict) and 'data' in resp: - resp_data = resp['data'] - job_metadatas = { - f'{job["username"]}~{job["name"]}': { - k: v for k, v in job.items() if k not in ['debugId', 'subState', 'executionType', 'appExitCode'] - } - for job in resp_data - } - else: - job_metadatas = None - return job_metadatas - - -def get_brief_job_metadata(resp): - """Get brief job metadata.""" - if isinstance(resp, dict) and 'data' in resp: - resp_data = resp['data'] - 
job_metadatas = [f'{job["username"]}~{job["name"]}' for job in resp_data] - job_metadatas = job_metadatas[:1000] - else: - job_metadatas = None - return job_metadatas - - -def query_powerbi(question: str, help_msg, llm_session=None): - """Query PowerBI data.""" - - # send HTML snippet so frontend (with rehype-raw enabled) can render with Tailwind styling - push_frontend_event('✍️ Copilot is crafting the query...
', replace=False) - query_gen_res, query_gen_status = query_generation_kql(question) - logger.info(f'KQL Query generation result: {query_gen_res}, status: {query_gen_status}') - k_cluster = os.environ.get('DATA_SRC_KUSTO_CLUSTER_URL', '') - k_db = os.environ.get('DATA_SRC_KUSTO_DATABASE_NAME', '') - k_table = '' - if query_gen_status == 0: - push_frontend_event('📥 Copilot is collecting the information...
', replace=False) - KQL = KustoExecutor(k_cluster, k_db, k_table) - # Replace placeholders - query_gen_res = query_gen_res.format( - cluster_url=k_cluster, - database_name=k_db + # extract + logger.info('Executing Query: LTP, Metadata') + resp = execute_openpai_query(query, {}) + job_metadata = self._extract_job_metadata(resp) + job_metadata_brief = self._get_brief_job_metadata(resp) + + # generate answer + logger.info('Generating Answer: LTP, Metadata') + summary = gen_summary( + self.SUB_FEATURE, + {'job_metadata': job_metadata}, + {'job_metadata_only_the_first_1000_jobs': job_metadata_brief}, + question, + 'gen_result_summary_metadata.txt', + None, + help_msg, + skip_summary, + self.llm_session ) - response, response_status = KQL.execute_return_data(query_gen_res) - response_long = {"query_generated": query_gen_res, "response_from_query_execution": response} - logger.info(f'Kusto Query execution result: {response}') - else: - response = {} - response_long = {"query_generated": "query generation failed, please perform manual investigation", "response_from_query_execution": response} - response_status = -1 - - logger.info('Generating Answer: LTP, Dashboard') - summary = gen_summary( - SUB_FEATURE, - response_long, - response, - question, - 'gen_result_summary_dashboard.txt', - None, - help_msg, - False, - llm_session - ) - - if response_status == 0: - reference = f'\n\n >Reference: the generated KQL query used to get the data:\n\n```\n{query_gen_res}\n```' - # push to frontend - push_frontend_event(reference) - - info_dict = {} - info_dict["s0_query_gen"] = {"res": query_gen_res, "status": query_gen_status} - info_dict["s1_query_exe"] = {"res": make_json_serializable(response), "status": response_status} - return summary, info_dict - - -def ltp_auto_reject(question: str, help_msg, llm_session=None): - """Auto rejected, unsupported by design.""" - - logger.info('Generating Answer: LTP, Auto Rejection') - summary = gen_summary( - SUB_FEATURE, - {}, - {}, - question, - 'gen_result_summary_rejection.txt', - None, - help_msg, - False, - llm_session - ) - - info_dict = {} - return summary, info_dict - - -def ltp_human_intervention(question: str, help_msg, llm_session=None): - """Handle human intervention for LTP auto rejection.""" - - logger.info('Generating Answer: LTP, Human Intervention') - summary = gen_summary( - SUB_FEATURE, - {}, - {}, - question, - 'gen_result_summary_human.txt', - None, - help_msg, - False, - llm_session - ) - - info_dict = {} - return summary, info_dict - -def make_json_serializable(data): - """ - Recursively converts non-JSON serializable objects within a data structure. 
- """ - if isinstance(data, (list, tuple)): - return [make_json_serializable(item) for item in data] - elif isinstance(data, dict): - return {key: make_json_serializable(value) for key, value in data.items()} - elif isinstance(data, datetime.timedelta): - # Convert timedelta to total seconds (a float) - return data.total_seconds() - else: - # Return the object as is if it's already serializable - return data \ No newline at end of file + + # generate additional info dict + info_dict = {} + info_dict['query_promql'] = query + info_dict['resp_brief_promql'] = job_metadata_brief + return summary, info_dict + + def query_user_manual(self, question: str, help_msg): + """Query user manual.""" + # read documentation + documentation = get_prompt_from(os.path.join(PROMPT_DIR, self.SUB_FEATURE, 'ltp_documentation_20250624.txt')) + ltp_doc = {'lucia training platform documentation': documentation} + + # generate answer + logger.info('Generating Answer: LTP, User Manual') + summary = gen_summary(self.SUB_FEATURE, ltp_doc, None, question, 'gen_result_summary_doc.txt', None, help_msg, llm_session=self.llm_session) + + info_dict = {} + return summary, info_dict + + def query_powerbi(self, question: str, help_msg): + """Query PowerBI data.""" + + # send HTML snippet so frontend (with rehype-raw enabled) can render with Tailwind styling + push_frontend_event('✍️ Copilot is crafting the query...
', replace=False) + query_gen_res, query_gen_status = query_generation_kql(question) + logger.info(f'KQL Query generation result: {query_gen_res}, status: {query_gen_status}') + k_cluster = os.environ.get('DATA_SRC_KUSTO_CLUSTER_URL', '') + k_db = os.environ.get('DATA_SRC_KUSTO_DATABASE_NAME', '') + k_table = '' + if query_gen_status == 0: + push_frontend_event('📥 Copilot is collecting the information...
', replace=False) + KQL = KustoExecutor(k_cluster, k_db, k_table) + # Replace placeholders + query_gen_res = query_gen_res.format( + cluster_url=k_cluster, + database_name=k_db + ) + response, response_status = KQL.execute_return_data(query_gen_res) + response_long = {"query_generated": query_gen_res, "response_from_query_execution": response} + logger.info(f'Kusto Query execution result: {response}') + else: + response = {} + response_long = {"query_generated": "query generation failed, please perform manual investigation", "response_from_query_execution": response} + response_status = -1 + + logger.info('Generating Answer: LTP, Dashboard') + summary = gen_summary( + self.SUB_FEATURE, + response_long, + response, + question, + 'gen_result_summary_dashboard.txt', + None, + help_msg, + False, + self.llm_session + ) + + if response_status == 0: + reference = f'\n\n >Reference: the generated KQL query used to get the data:\n\n```\n{query_gen_res}\n```' + # push to frontend + push_frontend_event(reference) + + info_dict = {} + info_dict["s0_query_gen"] = {"res": query_gen_res, "status": query_gen_status} + info_dict["s1_query_exe"] = {"res": self._make_json_serializable(response), "status": response_status} + return summary, info_dict + + def auto_reject(self, question: str, help_msg): + """Auto rejected, unsupported by design.""" + + logger.info('Generating Answer: LTP, Auto Rejection') + summary = gen_summary( + self.SUB_FEATURE, + {}, + {}, + question, + 'gen_result_summary_rejection.txt', + None, + help_msg, + False, + self.llm_session + ) + + info_dict = {} + return summary, info_dict + + def human_intervention(self, question: str, help_msg): + """Handle human intervention for LTP auto rejection.""" + + logger.info('Generating Answer: LTP, Human Intervention') + summary = gen_summary( + self.SUB_FEATURE, + {}, + {}, + question, + 'gen_result_summary_human.txt', + None, + help_msg, + False, + self.llm_session + ) + + info_dict = {} + return summary, info_dict + + @staticmethod + def _extract_job_metadata(resp): + """Extract job metadata from OpenPAI response.""" + if isinstance(resp, dict) and 'data' in resp: + resp_data = resp['data'] + job_metadatas = { + f'{job["username"]}~{job["name"]}': { + k: v for k, v in job.items() if k not in ['debugId', 'subState', 'executionType', 'appExitCode'] + } + for job in resp_data + } + else: + job_metadatas = None + return job_metadatas + + @staticmethod + def _get_brief_job_metadata(resp): + """Get brief job metadata.""" + if isinstance(resp, dict) and 'data' in resp: + resp_data = resp['data'] + job_metadatas = [f'{job["username"]}~{job["name"]}' for job in resp_data] + job_metadatas = job_metadatas[:1000] + else: + job_metadatas = None + return job_metadatas + + @staticmethod + def _make_json_serializable(data): + """ + Recursively converts non-JSON serializable objects within a data structure. 
+ """ + if isinstance(data, (list, tuple)): + return [LTP._make_json_serializable(item) for item in data] + elif isinstance(data, dict): + return {key: LTP._make_json_serializable(value) for key, value in data.items()} + elif isinstance(data, datetime.timedelta): + # Convert timedelta to total seconds (a float) + return data.total_seconds() + else: + # Return the object as is if it's already serializable + return data diff --git a/src/copilot-chat/src/copilot_agent/utils/__init__.py b/src/copilot-chat/src/copilot_agent/utils/__init__.py index 313af6dc..716a9700 100644 --- a/src/copilot-chat/src/copilot_agent/utils/__init__.py +++ b/src/copilot-chat/src/copilot_agent/utils/__init__.py @@ -33,6 +33,7 @@ ) from .push_frontend import( push_frontend_event, + set_thread_llm_session, ) from .query import ( gen_kusto_query_fallback_pseudo, @@ -100,4 +101,5 @@ 'push_frontend_event', 'retry_function', 'save_to_csv', + 'set_thread_llm_session', ] From 798d738224646d771796e3242da3ed60c3398dae Mon Sep 17 00:00:00 2001 From: Lei Date: Fri, 17 Oct 2025 15:48:12 -0700 Subject: [PATCH 16/34] code refactor: stage 5, fix chuck accumulation bug --- src/copilot-chat/src/copilot_agent/utils/llmsession.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/copilot-chat/src/copilot_agent/utils/llmsession.py b/src/copilot-chat/src/copilot_agent/utils/llmsession.py index aac17e16..be942898 100644 --- a/src/copilot-chat/src/copilot_agent/utils/llmsession.py +++ b/src/copilot-chat/src/copilot_agent/utils/llmsession.py @@ -232,7 +232,7 @@ def stream_chat(self, system_prompt, user_prompt): cb(full) except Exception: logger.debug('Stream callback failed') - yield full + yield chunk # If stream finishes without exception, stop generator normally return From 173a9b2da1c887ae5811575b15d425c739a66c0a Mon Sep 17 00:00:00 2001 From: Lei Date: Tue, 21 Oct 2025 18:06:18 -0700 Subject: [PATCH 17/34] code refactor: stage 6, smart help into a class --- .../src/copilot_agent/copilot_turn.py | 33 +++++--- src/copilot-chat/src/copilot_agent/ltp/ltp.py | 39 +++++++-- .../src/copilot_agent/utils/__init__.py | 4 +- .../src/copilot_agent/utils/smart_help.py | 82 ++++++++++--------- .../src/copilot_agent/utils/summary.py | 5 +- 5 files changed, 99 insertions(+), 64 deletions(-) diff --git a/src/copilot-chat/src/copilot_agent/copilot_turn.py b/src/copilot-chat/src/copilot_agent/copilot_turn.py index b77184c5..6d61080e 100644 --- a/src/copilot-chat/src/copilot_agent/copilot_turn.py +++ b/src/copilot-chat/src/copilot_agent/copilot_turn.py @@ -15,10 +15,10 @@ LLMSession, LTPReportProcessor, QuestionClassifier, - gen_smart_help, get_prompt_from, push_frontend_event, - set_thread_llm_session + set_thread_llm_session, + SmartHelp ) @@ -36,33 +36,39 @@ def __init__(self, llm_session: LLMSession, verbose: bool = False) -> None: self.system_prompt_answer_general = get_prompt_from(os.path.join(PROMPT_DIR, 'gen_smart_help_prompt_general.txt')) self.classifier = QuestionClassifier(self._version, self.llm_session) self.contextualizer = Contextualizer(self.llm_session) - self.processor = LTP(self.llm_session, False) + self.processor = LTP(self.llm_session) + self.smart_help = SmartHelp(self.help_msg, self.llm_session) # entry function, processes the list of messages and returns a dictionary with the results def process_turn(self, messages_list: list, skip_summary: bool = False, debugging: bool = False) -> dict: """Process the list of messages and return a dictionary with the results.""" - if debugging: - logger.info(f'DEBUGGING: 
{debugging}') - return {'category': None, 'answer': 'DEBUGGING MODE ENABLED', 'debug': {'debugging': debugging}} # Set thread-local session for push_frontend functions to use correct callback set_thread_llm_session(self.llm_session) - # get contextualized question from this and last user inquiry + # get from message list push_frontend_event('🤔 Copilot is understanding your request...
', replace=False) this_inquiry = messages_list[-1]['content'] last_inquiry = messages_list[-3]['content'] if len(messages_list) > 2 else None + + # debug only + if False: + question = this_inquiry + question_type = {'lv0_object': 'none', 'lv1_concern': 'none'} + answer, debug = self.processor.query_all_in_one(question, self.help_msg, skip_summary) + debug = {} + return {'category': question_type, 'answer': answer, 'debug': debug} + + # get contextualized question from this and last user inquiry question = self.contextualizer.contextualize(this_inquiry, last_inquiry) - #question = this_inquiry # classify the question to determine the solution source and method push_frontend_event('🔍 Copilot is finding the right the data source...
', replace=False) question_type = self.classifier.classify_question(question) - #question_type = {'lv0_object': '3', 'lv1_concern': '0'} - # objective, concern in the question obj, con = question_type.get('lv0_object', '3. [general]'), question_type.get('lv1_concern', '0. [others]') # verion f3, resolves objective 8 (Lucia Training Platform) + push_frontend_event('⏳ Copilot is processing your inquiry...
', replace=False) if self._version == 'f3': if obj.count('8') > 0: answer, debug = self.query_ltp(question, con, skip_summary) @@ -71,17 +77,18 @@ def process_turn(self, messages_list: list, skip_summary: bool = False, debuggin debug = {} elif obj.count('9') > 0: help_keys = ['feature'] - answer = gen_smart_help(self.help_msg, question, help_keys, True, self.llm_session) + answer = self.smart_help.generate(question, help_keys, True) debug = {} else: help_keys = ['unsupported_question'] - answer = gen_smart_help(self.help_msg, question, help_keys, True, self.llm_session) + answer = self.smart_help.generate(question, help_keys, True) + debug = {} else: # Placeholder for other version implementations help_keys = ['unsupported_question'] - answer = gen_smart_help(self.help_msg, question, help_keys, True, self.llm_session) + answer = self.smart_help.generate(question, help_keys, True) debug = {} return {'category': question_type, 'answer': answer, 'debug': debug} diff --git a/src/copilot-chat/src/copilot_agent/ltp/ltp.py b/src/copilot-chat/src/copilot_agent/ltp/ltp.py index dd4e2e86..adeff406 100644 --- a/src/copilot-chat/src/copilot_agent/ltp/ltp.py +++ b/src/copilot-chat/src/copilot_agent/ltp/ltp.py @@ -31,21 +31,21 @@ class LTP: SUB_FEATURE = 'ltp' SKIP_LUCIA_CONTROLLER_EXECUTION = True - def __init__(self, llm_session: LLMSession = None, skip_execution: bool = True) -> None: + def __init__(self, llm_session: LLMSession = None) -> None: """ Initialize LTP Query Engine. Args: llm_session: LLM session for generating queries and summaries - skip_execution: Whether to skip actual query execution (for testing/development) """ self.llm_session = llm_session - self.skip_execution = skip_execution + self.feature_skipped = True + self.ltp_documentation = get_prompt_from(os.path.join(PROMPT_DIR, self.SUB_FEATURE, 'ltp_documentation_20250624.txt')) def query_metrics(self, question: str, help_msg, skip_summary: bool = False): """Query cluster or job metrics from Prometheus backend.""" - if self.skip_execution: + if self.feature_skipped: logger.info('Skipping PromQL query generation.') query, end_time_stamp, parallel, param = None, None, None, {'scrape_interval': None, 'time_offset': None} logger.info('Skipping PromQL query execution.') @@ -103,7 +103,7 @@ def query_metadata(self, question: str, help_msg, skip_summary: bool = False): logger.info('Generating Query: LTP, Metadata') query = 'restserver/jobs?offset=0&limit=49999&withTotalCount=true&order=completionTime' - if self.skip_execution: + if self.feature_skipped: logger.info('Skipping job metadata query execution.') resp = f'Skipping job metadata query execution due to lack of support, this will be enabled in the next release.' 
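# --- Illustrative aside (hypothetical sketch, not part of this patch) --------
# This stage keeps moving prompt/file reads out of the per-turn path and into
# __init__ (ltp_documentation here, the SmartHelp prompts below), so each turn
# reuses cached text instead of re-reading files. A minimal sketch of that
# load-once-reuse pattern, with a made-up class name:

import os

class PromptCache:
    """Read prompt files once at construction; reuse them on every turn."""

    def __init__(self, prompt_dir: str, names: list[str]) -> None:
        self.prompts = {}
        for name in names:
            with open(os.path.join(prompt_dir, name), encoding='utf-8') as f:
                self.prompts[name] = f.read()

    def get(self, name: str) -> str:
        # no disk I/O on the hot path
        return self.prompts[name]
# -----------------------------------------------------------------------------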
job_metadata = {'result': resp} @@ -137,9 +137,7 @@ def query_metadata(self, question: str, help_msg, skip_summary: bool = False): def query_user_manual(self, question: str, help_msg): """Query user manual.""" - # read documentation - documentation = get_prompt_from(os.path.join(PROMPT_DIR, self.SUB_FEATURE, 'ltp_documentation_20250624.txt')) - ltp_doc = {'lucia training platform documentation': documentation} + ltp_doc = {'lucia training platform documentation': self.ltp_documentation} # generate answer logger.info('Generating Answer: LTP, User Manual') @@ -276,3 +274,28 @@ def _make_json_serializable(data): else: # Return the object as is if it's already serializable return data + + def query_all_in_one(self, question: str, help_msg, skip_summary: bool = False): + """Query all in one big session.""" + ltp_doc = {'lucia training platform documentation': self.ltp_documentation} + + sys_prompt0 = get_prompt_from(os.path.join(PROMPT_DIR, self.SUB_FEATURE, 'gen_result_summary_doc.txt')) + sys_prompt1 = get_prompt_from(os.path.join(PROMPT_DIR, self.SUB_FEATURE, 'gen_result_summary_metrics.txt')) + sys_prompt2 = get_prompt_from(os.path.join(PROMPT_DIR, self.SUB_FEATURE, 'gen_result_summary_metadata.txt')) + sys_prompt3 = get_prompt_from(os.path.join(PROMPT_DIR, self.SUB_FEATURE, 'gen_result_summary_dashboard.txt')) + sys_prompt4 = get_prompt_from(os.path.join(PROMPT_DIR, self.SUB_FEATURE, 'gen_result_summary_rejection.txt')) + sys_prompt5 = get_prompt_from(os.path.join(PROMPT_DIR, self.SUB_FEATURE, 'gen_result_summary_human.txt')) + sys_prompt = ( + sys_prompt0 + '\n\n' + + sys_prompt1 + '\n\n' + + sys_prompt2 + '\n\n' + + sys_prompt3 + '\n\n' + + sys_prompt4 + '\n\n' + + sys_prompt5 + '\n\n the user manual of lucia training platform is: \n\n' + + self.ltp_documentation + ) + + summary = self.llm_session.chat(sys_prompt, question) + + info_dict = {} + return summary, info_dict \ No newline at end of file diff --git a/src/copilot-chat/src/copilot_agent/utils/__init__.py b/src/copilot-chat/src/copilot_agent/utils/__init__.py index 716a9700..d0161dad 100644 --- a/src/copilot-chat/src/copilot_agent/utils/__init__.py +++ b/src/copilot-chat/src/copilot_agent/utils/__init__.py @@ -47,7 +47,7 @@ RestAPIClient, ) from .smart_help import ( - gen_smart_help, + SmartHelp, ) from .sql import ( SQLManager, @@ -76,6 +76,7 @@ 'PowerBIClient', 'QuestionClassifier', 'QueryGeneratorRAG', + 'SmartHelp', 'SQLManager', 'RestAPIClient', 'dcw_parser', @@ -89,7 +90,6 @@ 'gen_kusto_query_fallback_pseudo', 'gen_kusto_query_pseudo', 'gen_size_range', - 'gen_smart_help', 'gen_sql_query', 'gen_summary', 'get_current_unix_timestamp', diff --git a/src/copilot-chat/src/copilot_agent/utils/smart_help.py b/src/copilot-chat/src/copilot_agent/utils/smart_help.py index e0679b1b..157496c2 100644 --- a/src/copilot-chat/src/copilot_agent/utils/smart_help.py +++ b/src/copilot-chat/src/copilot_agent/utils/smart_help.py @@ -10,42 +10,48 @@ from ..utils.utils import get_prompt_from - -# generate help message for CoPilot agent -def gen_smart_help(help_msg, user_question: str, key_lst: list, SMART_HELP=True, llm_session=None) -> str: - """Generate smart help message for CoPilot agent.""" - # dump help method - dump_help = '' - if isinstance(help_msg, dict): - for key in key_lst: - if key in help_msg: - dump_help += help_msg[key] - dump_help += '\n\n' - # version - _version = 'f0f1' - if COPILOT_VERSION == 'f2': - _version = 'f2' - # capability - if _version == 'f2': - capability_str = help_msg['feature'] + help_msg['sku'] + 
help_msg['workload'] - elif _version == 'f0f1': - capability_str = help_msg['feature'] - else: - capability_str = help_msg['feature'] - - sys_prompt = get_prompt_from(os.path.join(PROMPT_DIR, 'gen_smart_help_prompt.txt')) +class SmartHelp: + """Smart help generator for CoPilot agent.""" - - if SMART_HELP: - # smart help method - capability_promp = f'[features]\n {capability_str} \n\n' - question_prompt = f'[user question]\n {user_question} \n\n' - user_prompt = question_prompt + f'[reason to generate the help]\n str{key_lst} \n\n' + capability_promp - # send to a LLM session to generate a smart help - smart_help = llm_session.try_stream_fallback_chat(sys_prompt, user_prompt) - final_help = smart_help - else: - dump_help_prompt = f'[reason to generate the help]\n {dump_help} \n\n' - final_help = llm_session.try_stream_fallback_chat(sys_prompt, dump_help_prompt) - - return final_help + def __init__(self, help_msg: dict, llm_session: LLMSession): + """Initialize with cached prompts.""" + self.help_msg = help_msg + self.llm_session = llm_session + self._version = 'f0f1' if COPILOT_VERSION != 'f2' else 'f2' + + # Load prompt once during initialization + self.sys_prompt = get_prompt_from( + os.path.join(PROMPT_DIR, 'gen_smart_help_prompt.txt') + ) + + # Prepare capability string based on version + if self._version == 'f2': + self.capability_str = (help_msg['feature'] + + help_msg['sku'] + + help_msg['workload']) + else: + self.capability_str = help_msg['feature'] + + def generate(self, user_question: str, key_lst: list, + smart_help: bool = True) -> str: + """Generate smart help message.""" + # Build dump_help from keys + dump_help = '\n\n'.join( + self.help_msg[key] for key in key_lst + if key in self.help_msg + ) + + if smart_help: + capability_prompt = f'[features]\n {self.capability_str} \n\n' + question_prompt = f'[user question]\n {user_question} \n\n' + user_prompt = (question_prompt + + f'[reason to generate the help]\n {key_lst} \n\n' + + capability_prompt) + return self.llm_session.try_stream_fallback_chat( + self.sys_prompt, user_prompt + ) + else: + dump_help_prompt = f'[reason to generate the help]\n {dump_help} \n\n' + return self.llm_session.try_stream_fallback_chat( + self.sys_prompt, dump_help_prompt + ) \ No newline at end of file diff --git a/src/copilot-chat/src/copilot_agent/utils/summary.py b/src/copilot-chat/src/copilot_agent/utils/summary.py index 3a000581..b1c7cb7e 100644 --- a/src/copilot-chat/src/copilot_agent/utils/summary.py +++ b/src/copilot-chat/src/copilot_agent/utils/summary.py @@ -11,7 +11,6 @@ from ..config import TOKEN_LIMIT, PROMPT_DIR from ..utils.llmsession import LLMSession -from ..utils.smart_help import gen_smart_help from ..utils.utils import get_prompt_from @@ -51,8 +50,8 @@ def gen_summary( else: logger.info('generating smart help') help_keys = ['corrupted_data'] - summary = help_keys[0] - summary = gen_smart_help(help_msg, question, help_keys, llm_session) + sys_prompt = help_msg[help_keys[0]] if help_keys[0] in help_msg else help_keys[0] + summary = llm_session.try_stream_fallback_chat(sys_prompt, question) return summary From 01384ec2c396b3b956f889f9a4ac637794488989 Mon Sep 17 00:00:00 2001 From: Lei Date: Wed, 22 Oct 2025 13:31:05 -0700 Subject: [PATCH 18/34] minor bug fix --- src/copilot-chat/src/copilot_agent/copilot_turn.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/copilot-chat/src/copilot_agent/copilot_turn.py b/src/copilot-chat/src/copilot_agent/copilot_turn.py index 6d61080e..f52f28e6 100644 --- 
a/src/copilot-chat/src/copilot_agent/copilot_turn.py +++ b/src/copilot-chat/src/copilot_agent/copilot_turn.py @@ -69,6 +69,7 @@ def process_turn(self, messages_list: list, skip_summary: bool = False, debuggin # verion f3, resolves objective 8 (Lucia Training Platform) push_frontend_event('⏳ Copilot is processing your inquiry...
', replace=False)
+        self.smart_help.llm_session = self.llm_session  # ensure processor uses the current llm_session
         if self._version == 'f3':
             if obj.count('8') > 0:
                 answer, debug = self.query_ltp(question, con, skip_summary)

From 16f4c63e2daf2e290c07e5d3092371bcc577a173 Mon Sep 17 00:00:00 2001
From: Lei
Date: Thu, 23 Oct 2025 11:26:51 -0700
Subject: [PATCH 19/34] improve: response latency by merging small llm chat calls into one classification session

---
 .../src/copilot_agent/copilot_turn.py         | 35 +-
 .../prompts/classification/f4/classify.txt    | 99 +++
 .../prompts/classification/f4/examples.txt    | 582 ++++++++++++++++++
 ...awise_help_f3.json => infrawise_help.json} |  0
 .../src/copilot_agent/utils/classify.py       | 18 +-
 5 files changed, 719 insertions(+), 15 deletions(-)
 create mode 100644 src/copilot-chat/src/copilot_agent/prompts/classification/f4/classify.txt
 create mode 100644 src/copilot-chat/src/copilot_agent/prompts/classification/f4/examples.txt
 rename src/copilot-chat/src/copilot_agent/prompts/help/{infrawise_help_f3.json => infrawise_help.json} (100%)

diff --git a/src/copilot-chat/src/copilot_agent/copilot_turn.py b/src/copilot-chat/src/copilot_agent/copilot_turn.py
index f52f28e6..ab04641a 100644
--- a/src/copilot-chat/src/copilot_agent/copilot_turn.py
+++ b/src/copilot-chat/src/copilot_agent/copilot_turn.py
@@ -47,7 +47,7 @@ def process_turn(self, messages_list: list, skip_summary: bool = False, debuggin
         set_thread_llm_session(self.llm_session)

         # get from message list
-        push_frontend_event('🤔 Copilot is understanding your request...
', replace=False) + this_inquiry = messages_list[-1]['content'] last_inquiry = messages_list[-3]['content'] if len(messages_list) > 2 else None @@ -58,19 +58,26 @@ def process_turn(self, messages_list: list, skip_summary: bool = False, debuggin answer, debug = self.processor.query_all_in_one(question, self.help_msg, skip_summary) debug = {} return {'category': question_type, 'answer': answer, 'debug': debug} - - # get contextualized question from this and last user inquiry - question = self.contextualizer.contextualize(this_inquiry, last_inquiry) - - # classify the question to determine the solution source and method - push_frontend_event('🔍 Copilot is finding the right the data source...
', replace=False) - question_type = self.classifier.classify_question(question) - obj, con = question_type.get('lv0_object', '3. [general]'), question_type.get('lv1_concern', '0. [others]') - - # verion f3, resolves objective 8 (Lucia Training Platform) + + if self._version == 'f4': + push_frontend_event('🧠 Copilot is understanding your request...
', replace=False) + question_type = self.classifier.parse_question(this_inquiry, last_inquiry) + question = question_type.get('new_question', this_inquiry) + obj = question_type.get('lv0_object', '3. [general]') + con = question_type.get('lv1_concern', '0. [others]') + else: + # get contextualized question from this and last user inquiry + push_frontend_event('🤔 Copilot is understanding your request...
', replace=False)
+            question = self.contextualizer.contextualize(this_inquiry, last_inquiry)
+            # classify the question to determine the solution source and method
+            push_frontend_event('🔍 Copilot is finding the right data source...
', replace=False)
+            question_type = self.classifier.classify_question(question)
+            obj, con = question_type.get('lv0_object', '3. [general]'), question_type.get('lv1_concern', '0. [others]')
+
+        # version f3, f4, resolves objective 8 (Lucia Training Platform)
         push_frontend_event('⏳ Copilot is processing your inquiry...
', replace=False) self.smart_help.llm_session = self.llm_session # ensure processor uses the current llm_session - if self._version == 'f3': + if self._version in ['f3', 'f4']: if obj.count('8') > 0: answer, debug = self.query_ltp(question, con, skip_summary) elif obj.count('3') > 0: @@ -141,7 +148,7 @@ def get_preload_dashboard(self): def load_help_message(self) -> dict: """Load the help message based on the version.""" - help_doc_path = os.path.join(PROMPT_DIR, 'help', f'infrawise_help_{self._version}.json') + help_doc_path = os.path.join(PROMPT_DIR, 'help', f'infrawise_help.json') if not os.path.exists(help_doc_path): logger.error(f'Help doc not found: {help_doc_path}') raise FileNotFoundError(f'Help doc not found: {help_doc_path}') @@ -152,7 +159,7 @@ def load_help_message(self) -> dict: def _initialize_version(self) -> str: """Determine and set the version.""" - allowed_versions = {'f2', 'f3', 'f0f1'} + allowed_versions = {'f2', 'f3', 'f0f1', 'f4'} version = COPILOT_VERSION if COPILOT_VERSION in allowed_versions else 'f3' logger.info(f'CoPilot - ver:{version}') return version diff --git a/src/copilot-chat/src/copilot_agent/prompts/classification/f4/classify.txt b/src/copilot-chat/src/copilot_agent/prompts/classification/f4/classify.txt new file mode 100644 index 00000000..604e4c1d --- /dev/null +++ b/src/copilot-chat/src/copilot_agent/prompts/classification/f4/classify.txt @@ -0,0 +1,99 @@ +Your task is to contextualize the user's question, and understand the user's intention from the question they asked, and assign a object and a concern as an indicator of the type of the question. + +Please output in the following format: +```json +{ + "new_question": "", + "lv0_object": "", + "lv1_concern": "" +} +``` + +# method for contextualize the user's question + +you need to first classify the relationship between this_question and last_question as either a 'direct follow-up' or a 'new query'. A 'direct follow-up' is a question that uses conversational shortcuts or comparative phrases (e.g., "how about X?", "what about X?", "why?") to build directly upon the previous question's intent. If it's classified as a 'direct follow-up', you should complement the missing language concepts (intent, specific entities, and attributes) from the last question. If it's a 'new query' (meaning it introduces a new topic, metric, or entity, even if it shares a common attribute like a week number), you must NOT contextualize it and should return the question as is. + +# method for identifying the object + +Four question objects are supported: + +8. [training_platform]: it means the user wants to ask for guidance of how to use a model training platform, specifically the Lucia Training Platform, or ask for detailed information about an already launched training job, usually such questions can be answered by searching or querying the detailed logs, debugging informations provided by the training platform. +3. [general]: it means user's can be answered fully using public information or general knowledge, without the need to ask for more context input from user. +9. [persona]: it means the user wants to understand the persona of you, what features you can provide, what capabilities you can serve. +4. [others]: it means user's prompt is not [training_platform] nor [general] nor [persona]. Note that the above types have specific definition. If user question is not belong to [training_platform] nor [general] nor [persona], it should be [others]. 
+ +below knowldge describes some key concepts commonly asked in the training_platform object questions: +knowledge #1: +single node trainig, distributed training, how to config a job, how to submit an issue related to the training platform, etc. + +knowledge #2: +the user manual of LTP (Lucia Training Platform) provides a detailed usage guidance to assist user better utilize the resource to perform single or distributed training task per their request. + +knowledge #3: the whole user manual consists several sections of documentation, with each focusing on explaining the detail instructions, configurations, information. Available sections are: +- [Quick Start](./UserManual/quickstart.md): A guide to help you quickly get started with the platform and submit a "hello world" job. +- [How to Use Docker Image](./UserManual/docker-images.md): Instructions on using Docker images in your jobs. +- [How to Manage Data and Code](./UserManual/use-data.md): Guidance on managing data and code within your jobs. +- [How to Write Job Configuration](./UserManual/job-config.md): Detailed instructions for configuring jobs and distributed jobs. +- [Job Priorities](./UserManual/job-priorities.md): Explanation of job priority types and how to submit jobs with specific priorities. +- [VC Allocation](./UserManual/vc-allocation.md): Guidelines for managing VC allocation requests and assignment change notifications. +- [Notification and Monitoring](./UserManual/notification.md): Information on the platform's notification and monitoring features. +- [Troubleshooting](./UserManual/troubleshooting.md): A troubleshooting guide for common issues and how to get platform support. +- [Email Templates, UserGroup Admin](./UserManual/email-templates/email-templates-user.md): Templates for UserGroup Admins to request VC allocations, production priority job submissions and integrating private Azure storage blob. +- [Email Templates, Lucia Training Platform Admin](./UserManual/email-templates/email-templates-ltp.md): Templates for Lucia Training Platform Admins to acknowledge, complete, and notify about VC allocation and assignment changes. + + +knowledge #4: there is a Lucia Training Platform Dashboard which includes data to let platform administrators and platform users to get detailed information about the performance, availability, failures, and resource utilization: +- Cluster Capacity Status +- Availability Status +- Node Health and Failures +- Job Interruption Analysis +- Node Recycle Times +- Mean Time Between Failures (MTBF) +- Daily Node Runtime Failures +- Failure Categorization and Reasons +- Daily Node Failure Occurrences + +# method for identifying the concern: + +Seven concerns are supported: + +1. [cluster_job_metrics]: it means the key concern of the user is to understand the current status overview of an object. Usually it requires perform root cause analysis to answer user's concerns. +2. [job_metadata]: it means the key concern of the user is to understand the root cause of an issue or problem about an object. Usually it requires perform root cause analysis to answer user's concerns. +3. [user_manual]: it means the key concern of the user is to understand a general concept on what is Lucia Training Platform, or LTP, or training platform, usually the question embedded with a intention to understand how to use the platform. +4. [dashboard]: it means the key concern of the user is to get detailed information about the performance, availability, failures, and resource utilization of a model training platform. +5. 
[auto_rejection]: it means the key concern of the user is not supported by the design of the trainig platform, the examples below provides some unsupported feature for you to compare with. +6. [human_intervention]: it means the key concern of the user is: not straightforward and may involve multiple potential causes, requiring human judgment to diagnose and resolve, or highlights failures or malfunctions in systems, resources, or infrastructure that cannot be automatically diagnose (e.g, ssh connection issue, GPU failure, node failure, job failure, portal crash, etc.), or explicitly or implicitly requests human assistance to investigate, diagnose, or fix the problem. +0. [others]: it means the key concern of the user is too general to abstrct. + +below knowldge describes some key concepts commonly asked in the cluster_job_metrics concern questions: +knowledge #1: metrics backend is Prometheus REST API server +knowledge #2: metrics could be cluster metrics, job metrics, filtered by specific virtual cluster (VC) or user, etc. + +below knowldge describes some key concepts commonly asked in the user_manual concern questions: +knowledge #1: the user manual of LTP (Lucia Training Platform) provides a detailed usage guidance to assist user better utilize the resource to perform single or distributed training task per their request. +knowledge #2: the whole user manual consists several sections of documentation, with each focusing on explaining the detail instructions, configurations, information. Available sections are: +- [Quick Start](./UserManual/quickstart.md): A guide to help you quickly get started with the platform and submit a "hello world" job. +- [How to Use Docker Image](./UserManual/docker-images.md): Instructions on using Docker images in your jobs. +- [How to Manage Data and Code](./UserManual/use-data.md): Guidance on managing data and code within your jobs. +- [How to Write Job Configuration](./UserManual/job-config.md): Detailed instructions for configuring jobs and distributed jobs. +- [Job Priorities](./UserManual/job-priorities.md): Explanation of job priority types and how to submit jobs with specific priorities. +- [VC Allocation](./UserManual/vc-allocation.md): Guidelines for managing VC allocation requests and assignment change notifications. +- [Notification and Monitoring](./UserManual/notification.md): Information on the platform's notification and monitoring features. +- [Troubleshooting](./UserManual/troubleshooting.md): A troubleshooting guide for common issues and how to get platform support. +- [Email Templates, UserGroup Admin](./UserManual/email-templates/email-templates-user.md): Templates for UserGroup Admins to request VC allocations, production priority job submissions and integrating private Azure storage blob. +- [Email Templates, Lucia Training Platform Admin](./UserManual/email-templates/email-templates-ltp.md): Templates for Lucia Training Platform Admins to acknowledge, complete, and notify about VC allocation and assignment changes. 
+knowledge #3: [How to Write Job Configuration] section details below topics: Parameters and Secrets, Distributed Settings, Environment Variables for Distributed Jobs, Job Exit Specifications + +below knowledge describes what dashboard data provides: +knowledge #4: there is a Lucia Training Platform Dashboard which includes data to let platform administrators and platform users to get detailed information about the performance, availability, failures, and resource utilization: +- Cluster Capacity Status +- Availability Status +- Node Health and Failures +- Job Interruption Analysis +- Node Recycle Times +- Mean Time Between Failures (MTBF) +- Daily Node Runtime Failures +- Failure Categorization and Reasons +- Daily Node Failure Occurrences + + diff --git a/src/copilot-chat/src/copilot_agent/prompts/classification/f4/examples.txt b/src/copilot-chat/src/copilot_agent/prompts/classification/f4/examples.txt new file mode 100644 index 00000000..43bf2da5 --- /dev/null +++ b/src/copilot-chat/src/copilot_agent/prompts/classification/f4/examples.txt @@ -0,0 +1,582 @@ +# Following are some examples + +## set a + +[Example a.1] +this_question: how about week 27? +last_question: what is the failure node count for week 28 +thinking: "how about week 27?" would be classified as a direct follow-up. It's a comparative phrase ("how about") and the only new entity is "week 27." The model would then infer the intent and attributes from the last question ("what is the failure node count for...") and correctly build the new question. +output: +```json +{ + "new_question": "what is the failure node count for week 27?", + "lv0_object": "8", + "lv1_concern": "4" +} +``` + +[Example a.2] +this_question: what can you do? +last_question: what is the failure node count for week 28 +output: +```json +{ + "new_question": "what can you do?", + "lv0_object": "9", + "lv1_concern": "0" +} +``` + +[Example a.3] +this_question: What is the average job duration for hardware failures in week 30? +last_question: What is the percentage of "Available VMs (Empty)" in the LON64PrdGPC01 cluster in week 30? +thinking: "What is the average job duration for hardware failures in week 30?" would be classified as a new query. While it shares "week 30" with the last question, its core intent (average job duration) and main entity (hardware failures) are completely different from the last question's intent (percentage of available VMs) and entity (LON64PrdGPC01 cluster). The new logic would correctly decide not to apply any contextualization. +output: +```json +{ + "new_question": "What is the average job duration for hardware failures in week 30?", + "lv0_object": "8", + "lv1_concern": "4" +} +``` + +## set b + +[Example b.1] +this_question +Is distributed training supported? +output: +```json +{ + "new_question": "Is distributed training supported?", + "lv0_object": "8", + "lv1_concern": "3" +} +``` + +[Example b.3] +this_question +what is the weather +output: +```json +{ + "new_question": "what is the weather", + "lv0_object": "4", + "lv1_concern": "0" +} +``` + +[Example b.7] +this_question +What is Chrismax +output: +```json +{ + "new_question": "What is Chrismax", + "lv0_object": "3", + "lv1_concern": "0" +} +``` + +[Example b.8] +this_question +Hi! +output: +```json +{ + "new_question": "Hi!", + "lv0_object": "3", + "lv1_concern": "0" +} +``` + +[Example b.9] +this_question +What can you do? 
+output: +```json +{ + "new_question": "What can you do?", + "lv0_object": "9", + "lv1_concern": "0" +} +``` + +[Example b.10] +this_question +What is the average availability ratio for the cluster CYS13PrdGPC02 in Week 27? +output: +```json +{ + "new_question": "What is the average availability ratio for the cluster CYS13PrdGPC02 in Week 27?", + "lv0_object": "8", + "lv1_concern": "4" +} +``` + +[Example b.11] +this_question +Which cluster had the highest percentage of allocated VMs in Week 22? +output: +```json +{ + "new_question": "Which cluster had the highest percentage of allocated VMs in Week 22?", + "lv0_object": "8", + "lv1_concern": "4" +} +``` + +[Example b.12] +this_question +What was the most common reason for node failures? +output: +```json +{ + "new_question": "What was the most common reason for node failures?", + "lv0_object": "8", + "lv1_concern": "4" +} +``` + +[Example b.13] +this_question +What is the Mean Time Between Failures (MTBF) for the cluster CYS13PrdGPC02 in Week 27? +output: +```json +{ + "new_question": "What is the Mean Time Between Failures (MTBF) for the cluster CYS13PrdGPC02 in Week 27?", + "lv0_object": "8", + "lv1_concern": "4" +} +``` + +[Example b.14] +this_question +What is the average reaction time for hardware failures in Week 27? +output: +```json +{ + "new_question": "What is the average reaction time for hardware failures in Week 27?", + "lv0_object": "8", + "lv1_concern": "4" +} +``` + +[Example b.15] +this_question +How many job interruptions occurred due to hardware failures, and what was the average duration of these interruptions? +output: +```json +{ + "new_question": "How many job interruptions occurred due to hardware failures, and what was the average duration of these interruptions?", + "lv0_object": "8", + "lv1_concern": "4" +} +``` + +[Example b.16] +this_question +Which cluster had the highest percentage of unallocatable VMs in Week 27? +output: +```json +{ + "new_question": "Which cluster had the highest percentage of unallocatable VMs in Week 27?", + "lv0_object": "8", + "lv1_concern": "4" +} +``` + +[Example b.17] +this_question +What was the total number of hardware-related node failures in Week 27 across all endpoints? +output: +```json +{ + "new_question": "What was the total number of hardware-related node failures in Week 27 across all endpoints?", + "lv0_object": "8", + "lv1_concern": "4" +} +``` + +[Example b.18] +this_question +What was the node recycle time for cordon nodes in the CYS13PrdGPC02 cluster in Week 25? +output: +```json +{ + "new_question": "What was the node recycle time for cordon nodes in the CYS13PrdGPC02 cluster in Week 25?", + "lv0_object": "8", + "lv1_concern": "4" +} +``` + +[Example b.19] +this_question +What was the average availability ratio for the cluster CYS13PrdGPC02 in Week 27? +output: +```json +{ + "new_question": "What was the average availability ratio for the cluster CYS13PrdGPC02 in Week 27?", + "lv0_object": "8", + "lv1_concern": "4" +} +``` + +[Example b.20] +this_question +How many deallocated nodes were reported for the cluster SJC24PrdGPC03? 
+output: +```json +{ + "new_question": "How many deallocated nodes were reported for the cluster SJC24PrdGPC03?", + "lv0_object": "8", + "lv1_concern": "4" +} +``` + +[Example b.21] +this_question +introduce yourself +output: +```json +{ + "new_question": "introduce yourself", + "lv0_object": "9", + "lv1_concern": "0" +} +``` + +[Example b.22] +this_question +There is no mechanism to prevent Virtual Cluster (VC) resource abuse, such as requesting an excessive number of GPUs. +output: +```json +{ + "new_question": "There is no mechanism to prevent Virtual Cluster (VC) resource abuse, such as requesting an excessive number of GPUs.", + "lv0_object": "8", + "lv1_concern": "5" +} +``` + +[Example b.23] +this_question +Users may be duplicating efforts by building Docker images that have already been created by others. +output: +```json +{ + "new_question": "Users may be duplicating efforts by building Docker images that have already been created by others.", + "lv0_object": "8", + "lv1_concern": "5" +} +``` + +[Example b.24] +this_question +I have been trying to get 4 nodes and successfully been allocated. However, it gets stuck three nodes are running and one keeps waiting -- blocking the whole process. Can you please take a look at it? +output: +```json +{ + "new_question": "I have been trying to get 4 nodes and successfully been allocated. However, it gets stuck three nodes are running and one keeps waiting -- blocking the whole process. Can you please take a look at it?", + "lv0_object": "8", + "lv1_concern": "6" +} +``` + +[Example b.25] +this_question +seems that we meet some network issue today, some nodes can connect to archive.ubuntu.com but some cannot, could you help to check? Err:12 jammy InRelease Could not connect to archive.ubuntu.com:80 (185.125.190.81), connection timed out Could not connect to archive.ubuntu.com:80 (91.189.91.82), connection timed out Could not connect to archive.ubuntu.com:80 (185.125.190.83), connection timed out Could not connect to archive.ubuntu.com:80 (185.125.190.82), connection timed out Could not connect to archive.ubuntu.com:80 (91.189.91.83), connection timed out +output: +```json +{ + "new_question": "seems that we meet some network issue today, some nodes can connect to archive.ubuntu.com but some cannot, could you help to check? Err:12 jammy InRelease Could not connect to archive.ubuntu.com:80 (185.125.190.81), connection timed out Could not connect to archive.ubuntu.com:80 (91.189.91.82), connection timed out Could not connect to archive.ubuntu.com:80 (185.125.190.83), connection timed out Could not connect to archive.ubuntu.com:80 (185.125.190.82), connection timed out Could not connect to archive.ubuntu.com:80 (91.189.91.83), connection timed out", + "lv0_object": "8", + "lv1_concern": "6" +} +``` + +[Example b.26] +this_question +We meet this issue, seems the portal is crashed. +output: +```json +{ + "new_question": "We meet this issue, seems the portal is crashed.", + "lv0_object": "8", + "lv1_concern": "6" +} +``` + +[Example b.27] +this_question +We have a 16 node job failed and we have try to resume several times, each time failed seems because of communication, could you help to find if there are some bad nodes in it? These are job links: +output: +```json +{ + "new_question": "We have a 16 node job failed and we have try to resume several times, each time failed seems because of communication, could you help to find if there are some bad nodes in it? 
These are job links:", + "lv0_object": "8", + "lv1_concern": "6" +} +``` + +[Example b.28] +this_question +When I try to submit job this afternoon, the portal says Error +output: +```json +{ + "new_question": "When I try to submit job this afternoon, the portal says Error", + "lv0_object": "8", + "lv1_concern": "6" +} +``` + +[Example b.29] +this_question +we have a 16-node job that seems to be hanging during startup — the GPU utilization stays at 0. We've retried multiple times on the same 16 nodes, but the issue persists each time. Could you please help check if there might be any issues with these nodes? here is the job link +output: +```json +{ + "new_question": "we have a 16-node job that seems to be hanging during startup — the GPU utilization stays at 0. We've retried multiple times on the same 16 nodes, but the issue persists each time. Could you please help check if there might be any issues with these nodes? here is the job link", + "lv0_object": "8", + "lv1_concern": "6" +} +``` + +[Example b.30] +this_question +Thanks to the powerful Lucia platform, we have successfully started several 16-node experiments. However, when some experiments are stopped, it seems that the portal does not show that these nodes have been released. This may also cause the jobs submitted later to be waiting all the time. And is it possible to replace the 3 bad nodes in the cluster so that we can start an additional experiment or scale up the current experiment, which will speed up our experiment progress. +output: +```json +{ + "new_question": "Thanks to the powerful Lucia platform, we have successfully started several 16-node experiments. However, when some experiments are stopped, it seems that the portal does not show that these nodes have been released. This may also cause the jobs submitted later to be waiting all the time. And is it possible to replace the 3 bad nodes in the cluster so that we can start an additional experiment or scale up the current experiment, which will speed up our experiment progress.", + "lv0_object": "8", + "lv1_concern": "6" +} +``` + +[Example b.31] +this_question +It seems that there are some potentially faulty nodes on the rstar VC. When we submit multi-node jobs, they may randomly land on these problematic nodes, causing job failures—even though the codebase remains unchanged. Here are two job links that may help illustrate the issue: Would it be possible to detect these potentially bad nodes and report them for maintenance? Thanks for your help! +output: +```json +{ + "new_question": "It seems that there are some potentially faulty nodes on the rstar VC. When we submit multi-node jobs, they may randomly land on these problematic nodes, causing job failures—even though the codebase remains unchanged. Here are two job links that may help illustrate the issue: Would it be possible to detect these potentially bad nodes and report them for maintenance? Thanks for your help!", + "lv0_object": "8", + "lv1_concern": "6" +} +``` + +[Example b.32] +this_question +sigma-s-mi300 shows it has been fully used (100%, so no more jobs can be submitted), but just half of the GPUs and CPUs shown been occupied. See the attached screenshot. +output: +```json +{ + "new_question": "sigma-s-mi300 shows it has been fully used (100%, so no more jobs can be submitted), but just half of the GPUs and CPUs shown been occupied. See the attached screenshot.", + "lv0_object": "8", + "lv1_concern": "6" +} +``` + +[Example b.33] +this_question +This job encountered "No CUDA GPUs are available". 
+output: +```json +{ + "new_question": "This job encountered "No CUDA GPUs are available".", + "lv0_object": "8", + "lv1_concern": "6" +} +``` + +[Example b.34] +this_question +As far as we know, there are three ways for us to communicate with blob storage: Using storage configs for read and write a public blob container. Using blobfuse2 to manually mount our private blob containers for rw. Using azcopy to pre-download necessary files and post-upload results to some specific blob containers. When we are training small models like less than 2B model, the problem may not so obvious and may not cause timeout. But recently, we are trying 7B and 10B models, whose ckpt will be 110G+ (together with the sliced states of each GPU), we may need to continually start with a previously trained ckpt and save the updated ckpts periodically. This demand may require not slow IO speed with blob. We try to save a ckpt after the second step in a training experiment. The current results are: Saving 7B model with method 1 will be timeout: Saving 7B model with method 2 is okay, Method 3 require too much labour for saving ckpts periodically. what is the Best Practice of Using Blob Storage?" +output: +```json +{ + "new_question": "As far as we know, there are three ways for us to communicate with blob storage: Using storage configs for read and write a public blob container. Using blobfuse2 to manually mount our private blob containers for rw. Using azcopy to pre-download necessary files and post-upload results to some specific blob containers. When we are training small models like less than 2B model, the problem may not so obvious and may not cause timeout. But recently, we are trying 7B and 10B models, whose ckpt will be 110G+ (together with the sliced states of each GPU), we may need to continually start with a previously trained ckpt and save the updated ckpts periodically. This demand may require not slow IO speed with blob. We try to save a ckpt after the second step in a training experiment. The current results are: Saving 7B model with method 1 will be timeout: Saving 7B model with method 2 is okay, Method 3 require too much labour for saving ckpts periodically. what is the Best Practice of Using Blob Storage?"", + "lv0_object": "8", + "lv1_concern": "6" +} +``` + +## set c + +[Example c.1] +this_question: +what is the idle gpu counts for virtual cluster (vc): mi300s? +output: +```json +{ + "new_question": "what is the idle gpu counts for virtual cluster (vc): mi300s?", + "lv0_object": "8", + "lv1_concern": "1" +} +``` + +[Example c.2] +this_question: +query allocated gpu counts, end time is now, offset is 1 day +output: +```json +{ + "new_question": "query allocated gpu counts, end time is now, offset is 1 day", + "lv0_object": "8", + "lv1_concern": "1" +} + +[Example c.3] +this_question: +query job meta data +output: +```json +{ + "new_question": "query job meta data", + "lv0_object": "8", + "lv1_concern": "2" +} + +[Example c.4] +this_question: +how to submit a distributed training job? +output: +```json +{ + "new_question": "how to submit a distributed training job?", + "lv0_object": "8", + "lv1_concern": "3" +} + +[Example c.5] +this_question: +write/create a new benchmark for ... 
+output: +```json +{ + "new_question": "write/create a new benchmark for ...", + "lv0_object": "4", + "lv1_concern": "0" +} +``` + +[Example c.6] +this_question: +query a list of running or history jobs +output: +```json +{ + "new_question": "query a list of running or history jobs", + "lv0_object": "8", + "lv1_concern": "1" +} +``` + +[Example c.31] +this_question: +I'm new to the platform, it would be nice if I could access the job via ssh for debugging, currently I can only modify the yaml file and rerun the job for debugging, which is a bit cumbersome. +output: +```json +{ + "new_question": "I'm new to platform, it would be nice if I could access the job via ssh for debugging, currently I can only modify the yaml file and rerun the job for debugging, which is a bit cumbersome.", + "lv0_object": "8", + "lv1_concern": "3" +} +``` + +[Example c.32] +this_question: +Is there any jump machine that can be used to ssh to the MI300x machine? +output: +```json +{ + "new_question": "Is there any jump machine that can be used to ssh to the MI300x machine?", + "lv0_object": "8", + "lv1_concern": "3" +} +``` + +[Example c.33] +this_question: +I conducted experiments using the same environment and dataset on both a single node and multiple nodes. From the experiments, I observed the following:

\n
    \n
  1. The total training steps for a single node are 8,638, with an estimated training time of around 52 hours.Platform for AI
  2. The total training steps for 4 nodes are 2,159, with an estimated training time of around 77 hours. Platform for AI
\n

I am wondering whether there might be some communication overhead causing the training time on multiple nodes to be higher than on a single node. Thank you! +output: +```json +{ + "new_question": "I conducted experiments using the same environment and dataset on both a single node and multiple nodes. From the experiments, I observed the following:

\n
    \n
  1. The total training steps for a single node are 8,638, with an estimated training time of around 52 hours.Platform for AI
  2. The total training steps for 4 nodes are 2,159, with an estimated training time of around 77 hours. Platform for AI
\n

I am wondering whether there might be some communication overhead causing the training time on multiple nodes to be higher than on a single node. Thank you!", + "lv0_object": "8", + "lv1_concern": "1" +} +``` + +[Example c.34] +this_question: +Users may not be familiar with certain types of GPUs (especially AMD GPUs). Launching jobs on these GPUs often results in a higher failure rate due to lack of familiarity. +output: +```json +{ + "new_question": "Users may not be familiar with certain types of GPUs (especially AMD GPUs). Launching jobs on these GPUs often results in a higher failure rate due to lack of familiarity.", + "lv0_object": "8", + "lv1_concern": "3" +} +``` + +[Example c.35] +this_question: +The behavior of different job priorities is not transparent to users, making it difficult for them to make informed decisions. +output: +```json +{ + "new_question": "The behavior of different job priorities is not transparent to users, making it difficult for them to make informed decisions.", + "lv0_object": "8", + "lv1_concern": "3" +} +``` + +[Example c.36] +this_question: +Can I setup a SSH connection to the node? +output: +```json +{ + "new_question": "Can I setup a SSH connection to the node?", + "lv0_object": "8", + "lv1_concern": "3" +} +``` + +[Example c.37] +this_question: +Users are unable to manage their expectations regarding job completion times due to a lack of visibility into job progress or estimated end times. +output: +```json +{ + "new_question": "Users are unable to manage their expectations regarding job completion times due to a lack of visibility into job progress or estimated end times.", + "lv0_object": "8", + "lv1_concern": "5" +} +``` + +[Example c.38] +this_question: +Which job had the longest duration in week 30, and what was its exit reason? +output: +```json +{ + "new_question": "Which job had the longest duration in week 30, and what was its exit reason?", + "lv0_object": "8", + "lv1_concern": "4" +} +``` + +[Example c.39] +this_question: +What are the cluster names in pai-wcu endpoint? 
+output: +```json +{ + "new_question": "What are the cluster names in pai-wcu endpoint?", + "lv0_object": "8", + "lv1_concern": "4" +} +``` \ No newline at end of file diff --git a/src/copilot-chat/src/copilot_agent/prompts/help/infrawise_help_f3.json b/src/copilot-chat/src/copilot_agent/prompts/help/infrawise_help.json similarity index 100% rename from src/copilot-chat/src/copilot_agent/prompts/help/infrawise_help_f3.json rename to src/copilot-chat/src/copilot_agent/prompts/help/infrawise_help.json diff --git a/src/copilot-chat/src/copilot_agent/utils/classify.py b/src/copilot-chat/src/copilot_agent/utils/classify.py index d14686f8..415ee671 100644 --- a/src/copilot-chat/src/copilot_agent/utils/classify.py +++ b/src/copilot-chat/src/copilot_agent/utils/classify.py @@ -8,7 +8,7 @@ from ..config import PROMPT_DIR from ..utils.logger import logger -from ..utils.utils import get_prompt_from +from ..utils.utils import get_prompt_from, extract_json_dict class QuestionClassifier: def __init__(self, version, model): @@ -17,6 +17,9 @@ def __init__(self, version, model): if self.version == 'f3': self.lv0_system_prompt = get_prompt_from(os.path.join(PROMPT_DIR, 'classification/f3/lv0.txt')) self.lv1_system_prompt = get_prompt_from(os.path.join(PROMPT_DIR, 'classification/f3/lv1.txt')) + elif self.version == 'f4': + self.lv0_system_prompt = get_prompt_from(os.path.join(PROMPT_DIR, 'classification/f4/classify.txt')) + self.lv1_system_prompt = get_prompt_from(os.path.join(PROMPT_DIR, 'classification/f4/examples.txt')) else: self.lv0_system_prompt = None self.lv1_system_prompt = None @@ -49,3 +52,16 @@ def classifier_lv1(self, question: str) -> str: else: resp = '0' # default to 0 return resp + + def parse_question(self, this_inquiry: str, last_inquiry: str) -> dict: + """Classify the question and return a dictionary with the results.""" + question_struct = {'new_question': '', 'lv0_object': None, 'lv1_concern': None} + resp = self.model.chat(self.lv0_system_prompt + self.lv1_system_prompt, f'this_inquiry: {this_inquiry}, last_inquiry: {last_inquiry}') + resp_dict = extract_json_dict(resp, nested=False) + if resp_dict: + if isinstance(resp_dict, dict): + question_struct['new_question'] = resp_dict.get('new_question', '') + question_struct['lv0_object'] = resp_dict.get('lv0_object', None) + question_struct['lv1_concern'] = resp_dict.get('lv1_concern', None) + return question_struct + From 27d8f641a9df5b26e227bab6b3a92c78ee0aa8a4 Mon Sep 17 00:00:00 2001 From: Lei Date: Thu, 23 Oct 2025 18:57:18 -0700 Subject: [PATCH 20/34] improve: response latency, by reuse llmsession for requests from the same username --- .../src/copilot_agent/copilot_service.py | 30 +++++++++++-------- 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/src/copilot-chat/src/copilot_agent/copilot_service.py b/src/copilot-chat/src/copilot_agent/copilot_service.py index 3a6049e3..85787fe7 100644 --- a/src/copilot-chat/src/copilot_agent/copilot_service.py +++ b/src/copilot-chat/src/copilot_agent/copilot_service.py @@ -48,11 +48,16 @@ def status(self): """GET endpoint for health/status check.""" return jsonify({"status": "running"}) - def get_or_create_session(self, user_id, conv_id, llm_session): - """Retrieve or create a copilot_conversation for the given userId and convId.""" + def get_or_create_session(self, user_id, conv_id): + """Retrieve or create a copilot_conversation for the given userId and convId, reusing its LLMSession. 
+ + A new LLMSession is created ONLY when the conversation is first seen; subsequent requests reuse + the existing session to avoid repeated client/session setup overhead. This helps reduce per-request + latency (~hundreds of ms) previously incurred by constructing new OpenAI/Azure clients. + """ session_key = f"{user_id}_{conv_id}" if session_key not in self.sessions: - self.sessions[session_key] = CoPilotConversation(llm_session) + self.sessions[session_key] = CoPilotConversation(LLMSession()) return self.sessions[session_key] def instance_operation(self): @@ -62,8 +67,8 @@ def instance_operation(self): data = request.get_json() user_id = data['data']['messageInfo']['userId'] conv_id = data['data']['messageInfo']['convId'] - llm_session = LLMSession() - copilot_conversation = self.get_or_create_session(user_id, conv_id, llm_session) + copilot_conversation = self.get_or_create_session(user_id, conv_id) + llm_session = copilot_conversation.llm_session in_parameters = copilot_conversation.build_in_parameters(data) @@ -97,14 +102,10 @@ def on_chunk(chunk: str): data = request.get_json() user_id = data['data']['messageInfo']['userId'] conv_id = data['data']['messageInfo']['convId'] - llm_session = LLMSession() # Create a new LLM session + copilot_conversation = self.get_or_create_session(user_id, conv_id) + llm_session = copilot_conversation.llm_session + # Attach streaming callback to existing session (no new session creation) llm_session.set_instance_stream_callback(on_chunk) - copilot_conversation = self.get_or_create_session(user_id, conv_id, llm_session) # Pass llm_session - - # CRITICAL: Update the llm_session in the conversation for subsequent requests - # The copilot_turn needs to use the NEW llm_session with the NEW callback - copilot_conversation.llm_session = llm_session - copilot_conversation.copilot_turn.llm_session = llm_session except KeyError as e: logger.error(f"Missing key in JSON body for stream_operation: {e}") return jsonify({"status": "error", "message": f"Missing key: {e}"}), 400 @@ -126,6 +127,11 @@ def worker(): logger.error(traceback.format_exc()) q.put(json.dumps({'error': str(e)})) finally: + # Clear streaming callback to avoid affecting subsequent non-stream requests + try: + llm_session.clear_instance_stream_callback() + except Exception: + logger.debug('Failed to clear instance stream callback') # signal end of stream q.put(None) From 138ba066e495895cde170612b31d841bb7b1f144 Mon Sep 17 00:00:00 2001 From: Lei Date: Mon, 27 Oct 2025 17:22:15 -0700 Subject: [PATCH 21/34] fix bug: missing import, changed prompt file name --- src/copilot-chat/src/copilot_agent/ltp/ltp.py | 2 +- src/copilot-chat/src/copilot_agent/utils/llmsession.py | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/copilot-chat/src/copilot_agent/ltp/ltp.py b/src/copilot-chat/src/copilot_agent/ltp/ltp.py index adeff406..affc9afc 100644 --- a/src/copilot-chat/src/copilot_agent/ltp/ltp.py +++ b/src/copilot-chat/src/copilot_agent/ltp/ltp.py @@ -40,7 +40,7 @@ def __init__(self, llm_session: LLMSession = None) -> None: """ self.llm_session = llm_session self.feature_skipped = True - self.ltp_documentation = get_prompt_from(os.path.join(PROMPT_DIR, self.SUB_FEATURE, 'ltp_documentation_20250624.txt')) + self.ltp_documentation = get_prompt_from(os.path.join(PROMPT_DIR, self.SUB_FEATURE, 'ltp_documentation.txt')) def query_metrics(self, question: str, help_msg, skip_summary: bool = False): """Query cluster or job metrics from Prometheus backend.""" diff --git 
a/src/copilot-chat/src/copilot_agent/utils/llmsession.py b/src/copilot-chat/src/copilot_agent/utils/llmsession.py index be942898..307494c4 100644 --- a/src/copilot-chat/src/copilot_agent/utils/llmsession.py +++ b/src/copilot-chat/src/copilot_agent/utils/llmsession.py @@ -6,6 +6,10 @@ import os import time import openai +import requests +import threading +from requests.adapters import HTTPAdapter +from urllib3.util.retry import Retry from ..utils.logger import logger class LLMSession: From 7ba13744aedf4648f86f7f178424bad4f283b507 Mon Sep 17 00:00:00 2001 From: Lei Date: Tue, 28 Oct 2025 13:08:55 -0700 Subject: [PATCH 22/34] debugging: log the question parsing output into the dev log --- src/copilot-chat/src/copilot_agent/utils/classify.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/copilot-chat/src/copilot_agent/utils/classify.py b/src/copilot-chat/src/copilot_agent/utils/classify.py index 415ee671..2626645d 100644 --- a/src/copilot-chat/src/copilot_agent/utils/classify.py +++ b/src/copilot-chat/src/copilot_agent/utils/classify.py @@ -63,5 +63,6 @@ def parse_question(self, this_inquiry: str, last_inquiry: str) -> dict: question_struct['new_question'] = resp_dict.get('new_question', '') question_struct['lv0_object'] = resp_dict.get('lv0_object', None) question_struct['lv1_concern'] = resp_dict.get('lv1_concern', None) + logger.info(f'Parsed question structure: {question_struct}') return question_struct From b9f3d52cbd82c9e8addf6295ffaafcc9fd431ce5 Mon Sep 17 00:00:00 2001 From: Lei Date: Tue, 28 Oct 2025 13:35:27 -0700 Subject: [PATCH 23/34] resolve review comment: add appropriate null handling --- src/copilot-chat/src/copilot_agent/utils/summary.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/copilot-chat/src/copilot_agent/utils/summary.py b/src/copilot-chat/src/copilot_agent/utils/summary.py index b1c7cb7e..1583ef43 100644 --- a/src/copilot-chat/src/copilot_agent/utils/summary.py +++ b/src/copilot-chat/src/copilot_agent/utils/summary.py @@ -43,6 +43,8 @@ def gen_summary( if not skip_summary: logger.info('Bypass summary: False') # try stream chat, if fail, fall back to chat + if llm_session is None: + raise ValueError("llm_session is required when skip_summary is False") summary = llm_session.try_stream_fallback_chat(sys_prompt, user_prompt) else: logger.info('Bypass summary: True') @@ -51,12 +53,17 @@ def gen_summary( logger.info('generating smart help') help_keys = ['corrupted_data'] sys_prompt = help_msg[help_keys[0]] if help_keys[0] in help_msg else help_keys[0] + if llm_session is None: + raise ValueError("llm_session is required for smart help generation") summary = llm_session.try_stream_fallback_chat(sys_prompt, question) return summary def handle_bypass_summary(resp_total, resp_brief): - """Handle bypass summary logic.""" + """Handle bypass summary logic. + + Returns full data if under TOKEN_LIMIT else brief data. 
+ """ if len(str(resp_total)) < TOKEN_LIMIT: logger.info('Outputing full data') return resp_total From e42bea9f0038e28b1969b99d765bdeb3e549073f Mon Sep 17 00:00:00 2001 From: Lei Date: Tue, 28 Oct 2025 13:36:03 -0700 Subject: [PATCH 24/34] update: nginx configuration to add the new /copilot/api/stream endpoint route --- contrib/copilot-plugin/src/app/ChatBox.tsx | 4 ++-- .../src/copilot_agent/copilot_service.py | 4 ++-- .../deploy/pylon-config/location.conf.template | 14 ++++++++++++++ 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/contrib/copilot-plugin/src/app/ChatBox.tsx b/contrib/copilot-plugin/src/app/ChatBox.tsx index 8f7d9270..779e4a01 100644 --- a/contrib/copilot-plugin/src/app/ChatBox.tsx +++ b/contrib/copilot-plugin/src/app/ChatBox.tsx @@ -21,8 +21,8 @@ export default function ChatBox() { // Use local backend when running the dev server (npm start), // and use the relative path for production builds (npm run build). const REMOTE_SERVER_URL = process.env.NODE_ENV === 'development' - ? 'http://127.0.0.1:60000/copilot/api/operation/stream' - : '/copilot/api/operation/stream'; + ? 'http://127.0.0.1:60000/copilot/api/stream' + : '/copilot/api/stream'; const makeChatRequest = async (e: React.FormEvent) => { e.preventDefault(); diff --git a/src/copilot-chat/src/copilot_agent/copilot_service.py b/src/copilot-chat/src/copilot_agent/copilot_service.py index 85787fe7..d2c7d7df 100644 --- a/src/copilot-chat/src/copilot_agent/copilot_service.py +++ b/src/copilot-chat/src/copilot_agent/copilot_service.py @@ -34,7 +34,7 @@ def __init__(self): self.app = Flask(__name__) self.app.add_url_rule('/copilot/api/status', view_func=self.status, methods=['GET']) self.app.add_url_rule('/copilot/api/operation', view_func=self.instance_operation, methods=['POST']) - self.app.add_url_rule('/copilot/api/operation/stream', view_func=self.stream_operation, methods=['POST']) + self.app.add_url_rule('/copilot/api/stream', view_func=self.stream_operation, methods=['POST']) # If running in local agent mode, enable CORS to allow local testing from dev frontends. if AGENT_MODE_LOCAL: @@ -89,7 +89,7 @@ def stream_operation(self): It sets a module-level callback in the summary module so streaming chunks are forwarded to the HTTP response. This avoids changing many internal call chains. 
""" - logger.info("Received request at /copilot/api/operation/stream") + logger.info("Received request at /copilot/api/stream") # Create queue BEFORE the callback function q = queue.Queue() diff --git a/src/pylon/deploy/pylon-config/location.conf.template b/src/pylon/deploy/pylon-config/location.conf.template index 02b9e940..71606cb6 100644 --- a/src/pylon/deploy/pylon-config/location.conf.template +++ b/src/pylon/deploy/pylon-config/location.conf.template @@ -63,6 +63,20 @@ location ~ ^/copilot/api/operation(.*)$ { proxy_send_timeout 2m; } +location ~ ^/copilot/api/stream(.*)$ { + proxy_pass {{COPILOT_CHAT_URI}}/copilot/api/stream$1$is_args$args; + proxy_connect_timeout 2m; + proxy_read_timeout 2m; + proxy_send_timeout 2m; + + # Required for SSE (Server-Sent Events) streaming + proxy_buffering off; + proxy_cache off; + proxy_set_header Connection ''; + proxy_http_version 1.1; + chunked_transfer_encoding off; +} + # Model proxy backend location ~ ^/model-proxy/(.*)$ { proxy_pass {{MODEL_PROXY_URI}}/$1$is_args$args; From 372cfe8406f306fd3a401bbf84300ad8abe19f11 Mon Sep 17 00:00:00 2001 From: Lei Qu <59161330+quge009@users.noreply.github.com> Date: Tue, 28 Oct 2025 13:41:17 -0700 Subject: [PATCH 25/34] Update src/copilot-chat/src/copilot_agent/copilot_conversation.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/copilot-chat/src/copilot_agent/copilot_conversation.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/copilot-chat/src/copilot_agent/copilot_conversation.py b/src/copilot-chat/src/copilot_agent/copilot_conversation.py index 77d86767..33091f9a 100644 --- a/src/copilot-chat/src/copilot_agent/copilot_conversation.py +++ b/src/copilot-chat/src/copilot_agent/copilot_conversation.py @@ -199,10 +199,6 @@ def _handle_user_question(self, user_id, conv_id, turn_id, user_prompt, skip_sum 'type': 'answer', 'timestampUnit': 'ms', } - # try: - # push_frontend_meta(response_message_info) - # except Exception: - # logger.debug('Failed to push early meta event for streaming client') resp['messageInfo'] = response_message_info debug_info = resp.get('debug') msg_add_kusto_resp = debug_info.get('kusto_response', None) if debug_info is not None else None From 3c41e327ae8176d1b0b57174e911b7cf14e6f800 Mon Sep 17 00:00:00 2001 From: Lei Date: Tue, 28 Oct 2025 13:46:17 -0700 Subject: [PATCH 26/34] remove unnecessary comment --- src/copilot-chat/src/copilot_agent/utils/logger.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/copilot-chat/src/copilot_agent/utils/logger.py b/src/copilot-chat/src/copilot_agent/utils/logger.py index abfcfead..eda1841e 100644 --- a/src/copilot-chat/src/copilot_agent/utils/logger.py +++ b/src/copilot-chat/src/copilot_agent/utils/logger.py @@ -56,7 +56,6 @@ def info(self, msg): try: print(f"{Fore.GREEN}[INFO]{Style.RESET_ALL} {msg}", flush=True) except (BrokenPipeError, OSError): - # stdout/stderr closed (client disconnected); fallback to Python logging logging.getLogger().info(msg) def error(self, msg): try: From 32a2906b4b31f019b7d52d0c22082519d25e7aa4 Mon Sep 17 00:00:00 2001 From: Lei Qu <59161330+quge009@users.noreply.github.com> Date: Tue, 28 Oct 2025 13:48:03 -0700 Subject: [PATCH 27/34] Update src/copilot-chat/src/copilot_agent/copilot_turn.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/copilot-chat/src/copilot_agent/copilot_turn.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/copilot-chat/src/copilot_agent/copilot_turn.py b/src/copilot-chat/src/copilot_agent/copilot_turn.py index 
ab04641a..f8392cb0 100644 --- a/src/copilot-chat/src/copilot_agent/copilot_turn.py +++ b/src/copilot-chat/src/copilot_agent/copilot_turn.py @@ -52,12 +52,6 @@ def process_turn(self, messages_list: list, skip_summary: bool = False, debuggin last_inquiry = messages_list[-3]['content'] if len(messages_list) > 2 else None # debug only - if False: - question = this_inquiry - question_type = {'lv0_object': 'none', 'lv1_concern': 'none'} - answer, debug = self.processor.query_all_in_one(question, self.help_msg, skip_summary) - debug = {} - return {'category': question_type, 'answer': answer, 'debug': debug} if self._version == 'f4': push_frontend_event('🧠 Copilot is understanding your request...
', replace=False) From 4019f8aaa16c912a1da02358927d72176f680646 Mon Sep 17 00:00:00 2001 From: Lei Date: Tue, 28 Oct 2025 13:56:27 -0700 Subject: [PATCH 28/34] resolve review comment: remove consle log --- .../copilot-plugin/src/app/ChatHistory.tsx | 38 +++---------------- 1 file changed, 5 insertions(+), 33 deletions(-) diff --git a/contrib/copilot-plugin/src/app/ChatHistory.tsx b/contrib/copilot-plugin/src/app/ChatHistory.tsx index 76c94c1c..b868f035 100644 --- a/contrib/copilot-plugin/src/app/ChatHistory.tsx +++ b/contrib/copilot-plugin/src/app/ChatHistory.tsx @@ -322,7 +322,6 @@ const GroupedChatMessages: React.FC = () => { const overflowY = style.overflowY if ((overflowY === 'auto' || overflowY === 'scroll' || overflowY === 'overlay') && cur.scrollHeight > cur.clientHeight) { cur.scrollTop = cur.scrollHeight - console.debug('[ChatHistory] scrollToBottom scrolled element', cur.tagName, { newScrollTop: cur.scrollTop }) return } } catch (e) { @@ -334,9 +333,8 @@ const GroupedChatMessages: React.FC = () => { // fallback to window/document try { window.scrollTo(0, document.documentElement.scrollHeight) - console.debug('[ChatHistory] scrollToBottom scrolled window', document.documentElement.scrollHeight) } catch (e) { - console.debug('[ChatHistory] scrollToBottom window scroll failed', e) + // ignore } } @@ -344,39 +342,18 @@ const GroupedChatMessages: React.FC = () => { const el = scrollRef.current if (!el) return - // Debug: log scroll metrics on message updates - try { - const now = Date.now() - const scrollHeight = el.scrollHeight - const clientHeight = el.clientHeight - const scrollTop = el.scrollTop - const distanceFromBottom = scrollHeight - scrollTop - clientHeight - console.debug(`[ChatHistory] messages changed @${now} count=${groupedMessages.length}`, { - scrollHeight, - clientHeight, - scrollTop, - distanceFromBottom, - prevCount: prevCountRef.current, - lastText: lastTextRef.current?.slice(0, 200) || '(none)' - }) - } catch (err) { - console.debug('[ChatHistory] debug log failed', err) - } - const distanceFromBottom = el.scrollHeight - el.scrollTop - el.clientHeight const shouldScroll = distanceFromBottom < NEAR_BOTTOM_THRESHOLD && (prevCountRef.current != groupedMessages.length || lastTextRef.current != lastText) if (shouldScroll) { - console.debug('[ChatHistory] will auto-scroll (requestAnimationFrame)') // try smooth scrolling in next frame requestAnimationFrame(() => { try { scrollToBottom(el) - console.debug('[ChatHistory] did auto-scroll (requestAnimationFrame)') } catch (e) { - console.debug('[ChatHistory] auto-scroll (RAF) failed', e) + // ignore } }) @@ -384,9 +361,8 @@ const GroupedChatMessages: React.FC = () => { setTimeout(() => { try { scrollToBottom(el) - console.debug('[ChatHistory] did auto-scroll (timeout)') } catch (e) { - console.debug('[ChatHistory] auto-scroll (timeout) failed', e) + // ignore } }, 120) } @@ -402,22 +378,18 @@ const GroupedChatMessages: React.FC = () => { const observer = new MutationObserver((mutations) => { try { - console.debug('[ChatHistory] MutationObserver triggered, mutations:', mutations.length) const distanceFromBottom = el.scrollHeight - el.scrollTop - el.clientHeight - console.debug('[ChatHistory] MutationObserver metrics', { scrollTop: el.scrollTop, scrollHeight: el.scrollHeight, clientHeight: el.clientHeight, distanceFromBottom }) if (distanceFromBottom < NEAR_BOTTOM_THRESHOLD) { - console.debug('[ChatHistory] MutationObserver will auto-scroll (RAF)') requestAnimationFrame(() => { try { scrollToBottom(el) - 
           console.debug('[ChatHistory] MutationObserver did auto-scroll')
         } catch (e) {
-          console.debug('[ChatHistory] MutationObserver auto-scroll failed', e)
+          // ignore
         }
       })
     }
   } catch (e) {
-    console.debug('[ChatHistory] MutationObserver callback error', e)
+    // ignore
   }
 })

From f8a5211493a803114a0bb4c953a4ecca62ba5ed1 Mon Sep 17 00:00:00 2001
From: Lei Qu <59161330+quge009@users.noreply.github.com>
Date: Tue, 28 Oct 2025 13:59:12 -0700
Subject: [PATCH 29/34] Update src/copilot-chat/src/copilot_agent/copilot_service.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 src/copilot-chat/src/copilot_agent/copilot_service.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/copilot-chat/src/copilot_agent/copilot_service.py b/src/copilot-chat/src/copilot_agent/copilot_service.py
index d2c7d7df..3544ba98 100644
--- a/src/copilot-chat/src/copilot_agent/copilot_service.py
+++ b/src/copilot-chat/src/copilot_agent/copilot_service.py
@@ -68,8 +68,6 @@ def instance_operation(self):
         user_id = data['data']['messageInfo']['userId']
         conv_id = data['data']['messageInfo']['convId']
         copilot_conversation = self.get_or_create_session(user_id, conv_id)
-        llm_session = copilot_conversation.llm_session
-
         in_parameters = copilot_conversation.build_in_parameters(data)
         out_parameters = copilot_conversation.perform_operation(in_parameters)

From 66d3a90cbe218117af9e8821f295ad8f5a2462bd Mon Sep 17 00:00:00 2001
From: Lei Qu <59161330+quge009@users.noreply.github.com>
Date: Tue, 28 Oct 2025 14:00:11 -0700
Subject: [PATCH 30/34] Update src/copilot-chat/src/copilot_agent/ltp/ltp.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 src/copilot-chat/src/copilot_agent/ltp/ltp.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/copilot-chat/src/copilot_agent/ltp/ltp.py b/src/copilot-chat/src/copilot_agent/ltp/ltp.py
index affc9afc..b95a1695 100644
--- a/src/copilot-chat/src/copilot_agent/ltp/ltp.py
+++ b/src/copilot-chat/src/copilot_agent/ltp/ltp.py
@@ -277,7 +277,6 @@ def _make_json_serializable(data):

     def query_all_in_one(self, question: str, help_msg, skip_summary: bool = False):
         """Query all in one big session."""
-        ltp_doc = {'lucia training platform documentation': self.ltp_documentation}

         sys_prompt0 = get_prompt_from(os.path.join(PROMPT_DIR, self.SUB_FEATURE, 'gen_result_summary_doc.txt'))
         sys_prompt1 = get_prompt_from(os.path.join(PROMPT_DIR, self.SUB_FEATURE, 'gen_result_summary_metrics.txt'))

From 735ec38782eb2151d6ae563882b22719375d8118 Mon Sep 17 00:00:00 2001
From: Lei Qu <59161330+quge009@users.noreply.github.com>
Date: Tue, 28 Oct 2025 14:00:47 -0700
Subject: [PATCH 31/34] Update src/copilot-chat/src/copilot_agent/copilot_conversation.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 src/copilot-chat/src/copilot_agent/copilot_conversation.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/copilot-chat/src/copilot_agent/copilot_conversation.py b/src/copilot-chat/src/copilot_agent/copilot_conversation.py
index 33091f9a..ded85627 100644
--- a/src/copilot-chat/src/copilot_agent/copilot_conversation.py
+++ b/src/copilot-chat/src/copilot_agent/copilot_conversation.py
@@ -13,7 +13,6 @@
 from .utils.logger import logger
 from .utils.authentication import AuthenticationManager
 from .utils.kql_executor import KustoExecutor
-from .utils.push_frontend import push_frontend_event, push_frontend_meta
 from .config import AGENT_MINIMAL_ON, print_env_variables
 from .copilot_turn import CoPilotTurn

From 53ddf4dc755bca1f73e8bcf1d874b7040c4fc026 Mon Sep 17 00:00:00 2001
From: Lei
Date: Tue, 28 Oct 2025 14:02:36 -0700
Subject: [PATCH 32/34] update: remove unused function

---
 src/copilot-chat/src/copilot_agent/ltp/ltp.py | 24 -------------------
 1 file changed, 24 deletions(-)

diff --git a/src/copilot-chat/src/copilot_agent/ltp/ltp.py b/src/copilot-chat/src/copilot_agent/ltp/ltp.py
index b95a1695..e03000f3 100644
--- a/src/copilot-chat/src/copilot_agent/ltp/ltp.py
+++ b/src/copilot-chat/src/copilot_agent/ltp/ltp.py
@@ -274,27 +274,3 @@ def _make_json_serializable(data):
         else:
             # Return the object as is if it's already serializable
             return data
-
-    def query_all_in_one(self, question: str, help_msg, skip_summary: bool = False):
-        """Query all in one big session."""
-
-        sys_prompt0 = get_prompt_from(os.path.join(PROMPT_DIR, self.SUB_FEATURE, 'gen_result_summary_doc.txt'))
-        sys_prompt1 = get_prompt_from(os.path.join(PROMPT_DIR, self.SUB_FEATURE, 'gen_result_summary_metrics.txt'))
-        sys_prompt2 = get_prompt_from(os.path.join(PROMPT_DIR, self.SUB_FEATURE, 'gen_result_summary_metadata.txt'))
-        sys_prompt3 = get_prompt_from(os.path.join(PROMPT_DIR, self.SUB_FEATURE, 'gen_result_summary_dashboard.txt'))
-        sys_prompt4 = get_prompt_from(os.path.join(PROMPT_DIR, self.SUB_FEATURE, 'gen_result_summary_rejection.txt'))
-        sys_prompt5 = get_prompt_from(os.path.join(PROMPT_DIR, self.SUB_FEATURE, 'gen_result_summary_human.txt'))
-        sys_prompt = (
-            sys_prompt0 + '\n\n' +
-            sys_prompt1 + '\n\n' +
-            sys_prompt2 + '\n\n' +
-            sys_prompt3 + '\n\n' +
-            sys_prompt4 + '\n\n' +
-            sys_prompt5 + '\n\n the user manual of lucia training platform is: \n\n' +
-            self.ltp_documentation
-        )
-
-        summary = self.llm_session.chat(sys_prompt, question)
-
-        info_dict = {}
-        return summary, info_dict
\ No newline at end of file

From 34b95ae88340d23b0a9d0cbfa860289a686cdda1 Mon Sep 17 00:00:00 2001
From: Lei
Date: Tue, 28 Oct 2025 14:30:09 -0700
Subject: [PATCH 33/34] improve: robustness, gracefully handling if classification fail

---
 src/copilot-chat/src/copilot_agent/copilot_turn.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/copilot-chat/src/copilot_agent/copilot_turn.py b/src/copilot-chat/src/copilot_agent/copilot_turn.py
index f8392cb0..98d09616 100644
--- a/src/copilot-chat/src/copilot_agent/copilot_turn.py
+++ b/src/copilot-chat/src/copilot_agent/copilot_turn.py
@@ -72,7 +72,12 @@ def process_turn(self, messages_list: list, skip_summary: bool = False, debuggin
         push_frontend_event('⏳ Copilot is processing your inquiry...', replace=False)
         self.smart_help.llm_session = self.llm_session  # ensure processor uses the current llm_session
         if self._version in ['f3', 'f4']:
-            if obj.count('8') > 0:
+            # If classification failed, treat as unsupported.
+            if obj is None or con is None:
+                help_keys = ['unsupported_question']
+                answer = self.smart_help.generate(question, help_keys, True)
+                debug = {}
+            elif obj.count('8') > 0:
                 answer, debug = self.query_ltp(question, con, skip_summary)
             elif obj.count('3') > 0:
                 answer = self.gen_answer_general(question)
@@ -80,12 +85,10 @@ def process_turn(self, messages_list: list, skip_summary: bool = False, debuggin
             elif obj.count('9') > 0:
                 help_keys = ['feature']
                 answer = self.smart_help.generate(question, help_keys, True)
-                debug = {}
             else:
                 help_keys = ['unsupported_question']
                 answer = self.smart_help.generate(question, help_keys, True)
-                debug = {}

         else:
             # Placeholder for other version implementations

From cb34b54af8d6d43903a22cf7340efdf2627a4928 Mon Sep 17 00:00:00 2001
From: Lei
Date: Tue, 28 Oct 2025 16:33:49 -0700
Subject: [PATCH 34/34] change classifier version for deployment

---
 src/copilot-chat/config/copilot-chat.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/copilot-chat/config/copilot-chat.yaml b/src/copilot-chat/config/copilot-chat.yaml
index 2adb9e0b..1675cb29 100644
--- a/src/copilot-chat/config/copilot-chat.yaml
+++ b/src/copilot-chat/config/copilot-chat.yaml
@@ -21,7 +21,7 @@ agent-host: "0.0.0.0"
 agent-port: "50000"
 secure-port: "8443"
 history-depth: "64"
-version: "f3"
+version: "f4"
 agent-mode: "remote"
 agent-mode-ca: "local"
 azure-openai-api-key: ""
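
Reviewer note on PATCH 33: the sketch below mirrors, outside the repository, the f3/f4 routing shown in the copilot_turn.py hunks so the new None-classification fallback can be sanity-checked in isolation. It is a minimal illustration only; the route_turn function and the stub callables are hypothetical stand-ins, not the actual CoPilotTurn class or SmartHelp API.

# Minimal sketch (not the repo's CoPilotTurn): reproduces the branching from
# PATCH 33 so the None-classification path can be exercised without the agent.
# 'smart_help', 'query_ltp' and 'gen_answer_general' are hypothetical stubs.

def route_turn(obj, con, question, smart_help, query_ltp, gen_answer_general):
    """Return (answer, debug) following the f3/f4 branching in the diff."""
    if obj is None or con is None:
        # Classification failed: degrade gracefully to the unsupported-question
        # help instead of raising AttributeError on obj.count(...).
        return smart_help(question, ['unsupported_question']), {}
    if obj.count('8') > 0:
        return query_ltp(question, con)
    if obj.count('3') > 0:
        return gen_answer_general(question), {}
    if obj.count('9') > 0:
        return smart_help(question, ['feature']), {}
    return smart_help(question, ['unsupported_question']), {}


if __name__ == '__main__':
    # Stub callables standing in for the agent's helpers.
    smart_help = lambda q, keys: f'help:{keys[0]}'
    query_ltp = lambda q, c: ('ltp-answer', {'kql': '...'})
    gen_answer = lambda q: 'general-answer'

    # Failed classification no longer crashes; it routes to the help message.
    print(route_turn(None, None, 'how do I check my job?', smart_help, query_ltp, gen_answer))
    # A normal '8' classification still routes to the LTP query path.
    print(route_turn('8', 'gpu metrics', 'show GPU utilization', smart_help, query_ltp, gen_answer))

Because the guard in the diff is `if self._version in ['f3', 'f4']`, the same fallback applies after PATCH 34 switches the deployed classifier version from f3 to f4.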