diff --git a/contrib/copilot-plugin/package-lock.json b/contrib/copilot-plugin/package-lock.json index 106ab43d..2f0b3bfa 100644 --- a/contrib/copilot-plugin/package-lock.json +++ b/contrib/copilot-plugin/package-lock.json @@ -57,6 +57,7 @@ "react-icons": "^5.5.0", "react-markdown": "^10.1.0", "react-refresh": "^0.11.0", + "rehype-raw": "^6.1.1", "remark-gfm": "^4.0.1", "resolve": "^1.20.0", "resolve-url-loader": "^4.0.0", @@ -3948,6 +3949,12 @@ "integrity": "sha512-dISoDXWWQwUquiKsyZ4Ng+HX2KsPL7LyHKHQwgGFEA3IaKac4Obd+h2a/a6waisAoepJlBcx9paWqjA8/HVjCw==", "license": "MIT" }, + "node_modules/@types/parse5": { + "version": "6.0.3", + "resolved": "https://registry.npmjs.org/@types/parse5/-/parse5-6.0.3.tgz", + "integrity": "sha512-SuT16Q1K51EAVPz1K29DJ/sXjhSQ0zjvsypYJ6tlwVsRV9jwW5Adq2ch8Dq8kDBCkYnELS7N7VNCSB5nC56t/g==", + "license": "MIT" + }, "node_modules/@types/prettier": { "version": "2.7.3", "resolved": "https://registry.npmjs.org/@types/prettier/-/prettier-2.7.3.tgz", @@ -9532,6 +9539,257 @@ "node": ">= 0.4" } }, + "node_modules/hast-util-from-parse5": { + "version": "7.1.2", + "resolved": "https://registry.npmjs.org/hast-util-from-parse5/-/hast-util-from-parse5-7.1.2.tgz", + "integrity": "sha512-Nz7FfPBuljzsN3tCQ4kCBKqdNhQE2l0Tn+X1ubgKBPRoiDIu1mL08Cfw4k7q71+Duyaw7DXDN+VTAp4Vh3oCOw==", + "license": "MIT", + "dependencies": { + "@types/hast": "^2.0.0", + "@types/unist": "^2.0.0", + "hastscript": "^7.0.0", + "property-information": "^6.0.0", + "vfile": "^5.0.0", + "vfile-location": "^4.0.0", + "web-namespaces": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/hast-util-from-parse5/node_modules/@types/hast": { + "version": "2.3.10", + "resolved": "https://registry.npmjs.org/@types/hast/-/hast-2.3.10.tgz", + "integrity": "sha512-McWspRw8xx8J9HurkVBfYj0xKoE25tOFlHGdx4MJ5xORQrMGZNqJhVQWaIbm6Oyla5kYOXtDiopzKRJzEOkwJw==", + "license": "MIT", + "dependencies": { + "@types/unist": "^2" + } + }, + "node_modules/hast-util-from-parse5/node_modules/@types/unist": { + "version": "2.0.11", + "resolved": "https://registry.npmjs.org/@types/unist/-/unist-2.0.11.tgz", + "integrity": "sha512-CmBKiL6NNo/OqgmMn95Fk9Whlp2mtvIv+KNpQKN2F4SjvrEesubTRWGYSg+BnWZOnlCaSTU1sMpsBOzgbYhnsA==", + "license": "MIT" + }, + "node_modules/hast-util-from-parse5/node_modules/property-information": { + "version": "6.5.0", + "resolved": "https://registry.npmjs.org/property-information/-/property-information-6.5.0.tgz", + "integrity": "sha512-PgTgs/BlvHxOu8QuEN7wi5A0OmXaBcHpmCSTehcs6Uuu9IkDIEo13Hy7n898RHfrQ49vKCoGeWZSaAK01nwVig==", + "license": "MIT", + "funding": { + "type": "github", + "url": "https://github.com/sponsors/wooorm" + } + }, + "node_modules/hast-util-from-parse5/node_modules/unist-util-stringify-position": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/unist-util-stringify-position/-/unist-util-stringify-position-3.0.3.tgz", + "integrity": "sha512-k5GzIBZ/QatR8N5X2y+drfpWG8IDBzdnVj6OInRNWm1oXrzydiaAT2OQiA8DPRRZyAKb9b6I2a6PxYklZD0gKg==", + "license": "MIT", + "dependencies": { + "@types/unist": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/hast-util-from-parse5/node_modules/vfile": { + "version": "5.3.7", + "resolved": "https://registry.npmjs.org/vfile/-/vfile-5.3.7.tgz", + "integrity": "sha512-r7qlzkgErKjobAmyNIkkSpizsFPYiUPuJb5pNW1RB4JcYVZhs4lIbVqk8XPk033CV/1z8ss5pkax8SuhGpcG8g==", + "license": "MIT", + "dependencies": { + "@types/unist": 
"^2.0.0", + "is-buffer": "^2.0.0", + "unist-util-stringify-position": "^3.0.0", + "vfile-message": "^3.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/hast-util-from-parse5/node_modules/vfile-message": { + "version": "3.1.4", + "resolved": "https://registry.npmjs.org/vfile-message/-/vfile-message-3.1.4.tgz", + "integrity": "sha512-fa0Z6P8HUrQN4BZaX05SIVXic+7kE3b05PWAtPuYP9QLHsLKYR7/AlLW3NtOrpXRLeawpDLMsVkmk5DG0NXgWw==", + "license": "MIT", + "dependencies": { + "@types/unist": "^2.0.0", + "unist-util-stringify-position": "^3.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/hast-util-parse-selector": { + "version": "3.1.1", + "resolved": "https://registry.npmjs.org/hast-util-parse-selector/-/hast-util-parse-selector-3.1.1.tgz", + "integrity": "sha512-jdlwBjEexy1oGz0aJ2f4GKMaVKkA9jwjr4MjAAI22E5fM/TXVZHuS5OpONtdeIkRKqAaryQ2E9xNQxijoThSZA==", + "license": "MIT", + "dependencies": { + "@types/hast": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/hast-util-parse-selector/node_modules/@types/hast": { + "version": "2.3.10", + "resolved": "https://registry.npmjs.org/@types/hast/-/hast-2.3.10.tgz", + "integrity": "sha512-McWspRw8xx8J9HurkVBfYj0xKoE25tOFlHGdx4MJ5xORQrMGZNqJhVQWaIbm6Oyla5kYOXtDiopzKRJzEOkwJw==", + "license": "MIT", + "dependencies": { + "@types/unist": "^2" + } + }, + "node_modules/hast-util-parse-selector/node_modules/@types/unist": { + "version": "2.0.11", + "resolved": "https://registry.npmjs.org/@types/unist/-/unist-2.0.11.tgz", + "integrity": "sha512-CmBKiL6NNo/OqgmMn95Fk9Whlp2mtvIv+KNpQKN2F4SjvrEesubTRWGYSg+BnWZOnlCaSTU1sMpsBOzgbYhnsA==", + "license": "MIT" + }, + "node_modules/hast-util-raw": { + "version": "7.2.3", + "resolved": "https://registry.npmjs.org/hast-util-raw/-/hast-util-raw-7.2.3.tgz", + "integrity": "sha512-RujVQfVsOrxzPOPSzZFiwofMArbQke6DJjnFfceiEbFh7S05CbPt0cYN+A5YeD3pso0JQk6O1aHBnx9+Pm2uqg==", + "license": "MIT", + "dependencies": { + "@types/hast": "^2.0.0", + "@types/parse5": "^6.0.0", + "hast-util-from-parse5": "^7.0.0", + "hast-util-to-parse5": "^7.0.0", + "html-void-elements": "^2.0.0", + "parse5": "^6.0.0", + "unist-util-position": "^4.0.0", + "unist-util-visit": "^4.0.0", + "vfile": "^5.0.0", + "web-namespaces": "^2.0.0", + "zwitch": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/hast-util-raw/node_modules/@types/hast": { + "version": "2.3.10", + "resolved": "https://registry.npmjs.org/@types/hast/-/hast-2.3.10.tgz", + "integrity": "sha512-McWspRw8xx8J9HurkVBfYj0xKoE25tOFlHGdx4MJ5xORQrMGZNqJhVQWaIbm6Oyla5kYOXtDiopzKRJzEOkwJw==", + "license": "MIT", + "dependencies": { + "@types/unist": "^2" + } + }, + "node_modules/hast-util-raw/node_modules/@types/unist": { + "version": "2.0.11", + "resolved": "https://registry.npmjs.org/@types/unist/-/unist-2.0.11.tgz", + "integrity": "sha512-CmBKiL6NNo/OqgmMn95Fk9Whlp2mtvIv+KNpQKN2F4SjvrEesubTRWGYSg+BnWZOnlCaSTU1sMpsBOzgbYhnsA==", + "license": "MIT" + }, + "node_modules/hast-util-raw/node_modules/unist-util-is": { + "version": "5.2.1", + "resolved": "https://registry.npmjs.org/unist-util-is/-/unist-util-is-5.2.1.tgz", + "integrity": "sha512-u9njyyfEh43npf1M+yGKDGVPbY/JWEemg5nH05ncKPfi+kBbKBJoTdsogMu33uhytuLlv9y0O7GH7fEdwLdLQw==", + "license": "MIT", + "dependencies": { + "@types/unist": "^2.0.0" + }, + 
"funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/hast-util-raw/node_modules/unist-util-position": { + "version": "4.0.4", + "resolved": "https://registry.npmjs.org/unist-util-position/-/unist-util-position-4.0.4.tgz", + "integrity": "sha512-kUBE91efOWfIVBo8xzh/uZQ7p9ffYRtUbMRZBNFYwf0RK8koUMx6dGUfwylLOKmaT2cs4wSW96QoYUSXAyEtpg==", + "license": "MIT", + "dependencies": { + "@types/unist": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/hast-util-raw/node_modules/unist-util-stringify-position": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/unist-util-stringify-position/-/unist-util-stringify-position-3.0.3.tgz", + "integrity": "sha512-k5GzIBZ/QatR8N5X2y+drfpWG8IDBzdnVj6OInRNWm1oXrzydiaAT2OQiA8DPRRZyAKb9b6I2a6PxYklZD0gKg==", + "license": "MIT", + "dependencies": { + "@types/unist": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/hast-util-raw/node_modules/unist-util-visit": { + "version": "4.1.2", + "resolved": "https://registry.npmjs.org/unist-util-visit/-/unist-util-visit-4.1.2.tgz", + "integrity": "sha512-MSd8OUGISqHdVvfY9TPhyK2VdUrPgxkUtWSuMHF6XAAFuL4LokseigBnZtPnJMu+FbynTkFNnFlyjxpVKujMRg==", + "license": "MIT", + "dependencies": { + "@types/unist": "^2.0.0", + "unist-util-is": "^5.0.0", + "unist-util-visit-parents": "^5.1.1" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/hast-util-raw/node_modules/unist-util-visit-parents": { + "version": "5.1.3", + "resolved": "https://registry.npmjs.org/unist-util-visit-parents/-/unist-util-visit-parents-5.1.3.tgz", + "integrity": "sha512-x6+y8g7wWMyQhL1iZfhIPhDAs7Xwbn9nRosDXl7qoPTSCy0yNxnKc+hWokFifWQIDGi154rdUqKvbCa4+1kLhg==", + "license": "MIT", + "dependencies": { + "@types/unist": "^2.0.0", + "unist-util-is": "^5.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/hast-util-raw/node_modules/vfile": { + "version": "5.3.7", + "resolved": "https://registry.npmjs.org/vfile/-/vfile-5.3.7.tgz", + "integrity": "sha512-r7qlzkgErKjobAmyNIkkSpizsFPYiUPuJb5pNW1RB4JcYVZhs4lIbVqk8XPk033CV/1z8ss5pkax8SuhGpcG8g==", + "license": "MIT", + "dependencies": { + "@types/unist": "^2.0.0", + "is-buffer": "^2.0.0", + "unist-util-stringify-position": "^3.0.0", + "vfile-message": "^3.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/hast-util-raw/node_modules/vfile-message": { + "version": "3.1.4", + "resolved": "https://registry.npmjs.org/vfile-message/-/vfile-message-3.1.4.tgz", + "integrity": "sha512-fa0Z6P8HUrQN4BZaX05SIVXic+7kE3b05PWAtPuYP9QLHsLKYR7/AlLW3NtOrpXRLeawpDLMsVkmk5DG0NXgWw==", + "license": "MIT", + "dependencies": { + "@types/unist": "^2.0.0", + "unist-util-stringify-position": "^3.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, "node_modules/hast-util-to-jsx-runtime": { "version": "2.3.6", "resolved": "https://registry.npmjs.org/hast-util-to-jsx-runtime/-/hast-util-to-jsx-runtime-2.3.6.tgz", @@ -9559,6 +9817,49 @@ "url": "https://opencollective.com/unified" } }, + "node_modules/hast-util-to-parse5": { + "version": "7.1.0", + "resolved": "https://registry.npmjs.org/hast-util-to-parse5/-/hast-util-to-parse5-7.1.0.tgz", + "integrity": 
"sha512-YNRgAJkH2Jky5ySkIqFXTQiaqcAtJyVE+D5lkN6CdtOqrnkLfGYYrEcKuHOJZlp+MwjSwuD3fZuawI+sic/RBw==", + "license": "MIT", + "dependencies": { + "@types/hast": "^2.0.0", + "comma-separated-tokens": "^2.0.0", + "property-information": "^6.0.0", + "space-separated-tokens": "^2.0.0", + "web-namespaces": "^2.0.0", + "zwitch": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/hast-util-to-parse5/node_modules/@types/hast": { + "version": "2.3.10", + "resolved": "https://registry.npmjs.org/@types/hast/-/hast-2.3.10.tgz", + "integrity": "sha512-McWspRw8xx8J9HurkVBfYj0xKoE25tOFlHGdx4MJ5xORQrMGZNqJhVQWaIbm6Oyla5kYOXtDiopzKRJzEOkwJw==", + "license": "MIT", + "dependencies": { + "@types/unist": "^2" + } + }, + "node_modules/hast-util-to-parse5/node_modules/@types/unist": { + "version": "2.0.11", + "resolved": "https://registry.npmjs.org/@types/unist/-/unist-2.0.11.tgz", + "integrity": "sha512-CmBKiL6NNo/OqgmMn95Fk9Whlp2mtvIv+KNpQKN2F4SjvrEesubTRWGYSg+BnWZOnlCaSTU1sMpsBOzgbYhnsA==", + "license": "MIT" + }, + "node_modules/hast-util-to-parse5/node_modules/property-information": { + "version": "6.5.0", + "resolved": "https://registry.npmjs.org/property-information/-/property-information-6.5.0.tgz", + "integrity": "sha512-PgTgs/BlvHxOu8QuEN7wi5A0OmXaBcHpmCSTehcs6Uuu9IkDIEo13Hy7n898RHfrQ49vKCoGeWZSaAK01nwVig==", + "license": "MIT", + "funding": { + "type": "github", + "url": "https://github.com/sponsors/wooorm" + } + }, "node_modules/hast-util-whitespace": { "version": "3.0.0", "resolved": "https://registry.npmjs.org/hast-util-whitespace/-/hast-util-whitespace-3.0.0.tgz", @@ -9572,6 +9873,48 @@ "url": "https://opencollective.com/unified" } }, + "node_modules/hastscript": { + "version": "7.2.0", + "resolved": "https://registry.npmjs.org/hastscript/-/hastscript-7.2.0.tgz", + "integrity": "sha512-TtYPq24IldU8iKoJQqvZOuhi5CyCQRAbvDOX0x1eW6rsHSxa/1i2CCiptNTotGHJ3VoHRGmqiv6/D3q113ikkw==", + "license": "MIT", + "dependencies": { + "@types/hast": "^2.0.0", + "comma-separated-tokens": "^2.0.0", + "hast-util-parse-selector": "^3.0.0", + "property-information": "^6.0.0", + "space-separated-tokens": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/hastscript/node_modules/@types/hast": { + "version": "2.3.10", + "resolved": "https://registry.npmjs.org/@types/hast/-/hast-2.3.10.tgz", + "integrity": "sha512-McWspRw8xx8J9HurkVBfYj0xKoE25tOFlHGdx4MJ5xORQrMGZNqJhVQWaIbm6Oyla5kYOXtDiopzKRJzEOkwJw==", + "license": "MIT", + "dependencies": { + "@types/unist": "^2" + } + }, + "node_modules/hastscript/node_modules/@types/unist": { + "version": "2.0.11", + "resolved": "https://registry.npmjs.org/@types/unist/-/unist-2.0.11.tgz", + "integrity": "sha512-CmBKiL6NNo/OqgmMn95Fk9Whlp2mtvIv+KNpQKN2F4SjvrEesubTRWGYSg+BnWZOnlCaSTU1sMpsBOzgbYhnsA==", + "license": "MIT" + }, + "node_modules/hastscript/node_modules/property-information": { + "version": "6.5.0", + "resolved": "https://registry.npmjs.org/property-information/-/property-information-6.5.0.tgz", + "integrity": "sha512-PgTgs/BlvHxOu8QuEN7wi5A0OmXaBcHpmCSTehcs6Uuu9IkDIEo13Hy7n898RHfrQ49vKCoGeWZSaAK01nwVig==", + "license": "MIT", + "funding": { + "type": "github", + "url": "https://github.com/sponsors/wooorm" + } + }, "node_modules/he": { "version": "1.2.0", "resolved": "https://registry.npmjs.org/he/-/he-1.2.0.tgz", @@ -9714,6 +10057,16 @@ "url": "https://opencollective.com/unified" } }, + "node_modules/html-void-elements": { + 
"version": "2.0.1", + "resolved": "https://registry.npmjs.org/html-void-elements/-/html-void-elements-2.0.1.tgz", + "integrity": "sha512-0quDb7s97CfemeJAnW9wC0hw78MtW7NU3hqtCD75g2vFlDLt36llsYD7uB7SUzojLMP24N5IatXf7ylGXiGG9A==", + "license": "MIT", + "funding": { + "type": "github", + "url": "https://github.com/sponsors/wooorm" + } + }, "node_modules/html-webpack-plugin": { "version": "5.6.3", "resolved": "https://registry.npmjs.org/html-webpack-plugin/-/html-webpack-plugin-5.6.3.tgz", @@ -10151,6 +10504,29 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/is-buffer": { + "version": "2.0.5", + "resolved": "https://registry.npmjs.org/is-buffer/-/is-buffer-2.0.5.tgz", + "integrity": "sha512-i2R6zNFDwgEHJyQUtJEk0XFi1i0dPFn/oqjK3/vPCcDeJvW5NQ83V8QbicfF1SupOaB0h8ntgBC2YiE7dfyctQ==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/feross" + }, + { + "type": "patreon", + "url": "https://www.patreon.com/feross" + }, + { + "type": "consulting", + "url": "https://feross.org/support" + } + ], + "license": "MIT", + "engines": { + "node": ">=4" + } + }, "node_modules/is-callable": { "version": "1.2.7", "resolved": "https://registry.npmjs.org/is-callable/-/is-callable-1.2.7.tgz", @@ -16004,6 +16380,110 @@ "node": ">=6" } }, + "node_modules/rehype-raw": { + "version": "6.1.1", + "resolved": "https://registry.npmjs.org/rehype-raw/-/rehype-raw-6.1.1.tgz", + "integrity": "sha512-d6AKtisSRtDRX4aSPsJGTfnzrX2ZkHQLE5kiUuGOeEoLpbEulFF4hj0mLPbsa+7vmguDKOVVEQdHKDSwoaIDsQ==", + "license": "MIT", + "dependencies": { + "@types/hast": "^2.0.0", + "hast-util-raw": "^7.2.0", + "unified": "^10.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/rehype-raw/node_modules/@types/hast": { + "version": "2.3.10", + "resolved": "https://registry.npmjs.org/@types/hast/-/hast-2.3.10.tgz", + "integrity": "sha512-McWspRw8xx8J9HurkVBfYj0xKoE25tOFlHGdx4MJ5xORQrMGZNqJhVQWaIbm6Oyla5kYOXtDiopzKRJzEOkwJw==", + "license": "MIT", + "dependencies": { + "@types/unist": "^2" + } + }, + "node_modules/rehype-raw/node_modules/@types/unist": { + "version": "2.0.11", + "resolved": "https://registry.npmjs.org/@types/unist/-/unist-2.0.11.tgz", + "integrity": "sha512-CmBKiL6NNo/OqgmMn95Fk9Whlp2mtvIv+KNpQKN2F4SjvrEesubTRWGYSg+BnWZOnlCaSTU1sMpsBOzgbYhnsA==", + "license": "MIT" + }, + "node_modules/rehype-raw/node_modules/is-plain-obj": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/is-plain-obj/-/is-plain-obj-4.1.0.tgz", + "integrity": "sha512-+Pgi+vMuUNkJyExiMBt5IlFoMyKnr5zhJ4Uspz58WOhBF5QoIZkFyNHIbBAtHwzVAgk5RtndVNsDRN61/mmDqg==", + "license": "MIT", + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/rehype-raw/node_modules/unified": { + "version": "10.1.2", + "resolved": "https://registry.npmjs.org/unified/-/unified-10.1.2.tgz", + "integrity": "sha512-pUSWAi/RAnVy1Pif2kAoeWNBa3JVrx0MId2LASj8G+7AiHWoKZNTomq6LG326T68U7/e263X6fTdcXIy7XnF7Q==", + "license": "MIT", + "dependencies": { + "@types/unist": "^2.0.0", + "bail": "^2.0.0", + "extend": "^3.0.0", + "is-buffer": "^2.0.0", + "is-plain-obj": "^4.0.0", + "trough": "^2.0.0", + "vfile": "^5.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/rehype-raw/node_modules/unist-util-stringify-position": { + "version": "3.0.3", + "resolved": 
"https://registry.npmjs.org/unist-util-stringify-position/-/unist-util-stringify-position-3.0.3.tgz", + "integrity": "sha512-k5GzIBZ/QatR8N5X2y+drfpWG8IDBzdnVj6OInRNWm1oXrzydiaAT2OQiA8DPRRZyAKb9b6I2a6PxYklZD0gKg==", + "license": "MIT", + "dependencies": { + "@types/unist": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/rehype-raw/node_modules/vfile": { + "version": "5.3.7", + "resolved": "https://registry.npmjs.org/vfile/-/vfile-5.3.7.tgz", + "integrity": "sha512-r7qlzkgErKjobAmyNIkkSpizsFPYiUPuJb5pNW1RB4JcYVZhs4lIbVqk8XPk033CV/1z8ss5pkax8SuhGpcG8g==", + "license": "MIT", + "dependencies": { + "@types/unist": "^2.0.0", + "is-buffer": "^2.0.0", + "unist-util-stringify-position": "^3.0.0", + "vfile-message": "^3.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/rehype-raw/node_modules/vfile-message": { + "version": "3.1.4", + "resolved": "https://registry.npmjs.org/vfile-message/-/vfile-message-3.1.4.tgz", + "integrity": "sha512-fa0Z6P8HUrQN4BZaX05SIVXic+7kE3b05PWAtPuYP9QLHsLKYR7/AlLW3NtOrpXRLeawpDLMsVkmk5DG0NXgWw==", + "license": "MIT", + "dependencies": { + "@types/unist": "^2.0.0", + "unist-util-stringify-position": "^3.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, "node_modules/relateurl": { "version": "0.2.7", "resolved": "https://registry.npmjs.org/relateurl/-/relateurl-0.2.7.tgz", @@ -18816,6 +19296,69 @@ "url": "https://opencollective.com/unified" } }, + "node_modules/vfile-location": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/vfile-location/-/vfile-location-4.1.0.tgz", + "integrity": "sha512-YF23YMyASIIJXpktBa4vIGLJ5Gs88UB/XePgqPmTa7cDA+JeO3yclbpheQYCHjVHBn/yePzrXuygIL+xbvRYHw==", + "license": "MIT", + "dependencies": { + "@types/unist": "^2.0.0", + "vfile": "^5.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/vfile-location/node_modules/@types/unist": { + "version": "2.0.11", + "resolved": "https://registry.npmjs.org/@types/unist/-/unist-2.0.11.tgz", + "integrity": "sha512-CmBKiL6NNo/OqgmMn95Fk9Whlp2mtvIv+KNpQKN2F4SjvrEesubTRWGYSg+BnWZOnlCaSTU1sMpsBOzgbYhnsA==", + "license": "MIT" + }, + "node_modules/vfile-location/node_modules/unist-util-stringify-position": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/unist-util-stringify-position/-/unist-util-stringify-position-3.0.3.tgz", + "integrity": "sha512-k5GzIBZ/QatR8N5X2y+drfpWG8IDBzdnVj6OInRNWm1oXrzydiaAT2OQiA8DPRRZyAKb9b6I2a6PxYklZD0gKg==", + "license": "MIT", + "dependencies": { + "@types/unist": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/vfile-location/node_modules/vfile": { + "version": "5.3.7", + "resolved": "https://registry.npmjs.org/vfile/-/vfile-5.3.7.tgz", + "integrity": "sha512-r7qlzkgErKjobAmyNIkkSpizsFPYiUPuJb5pNW1RB4JcYVZhs4lIbVqk8XPk033CV/1z8ss5pkax8SuhGpcG8g==", + "license": "MIT", + "dependencies": { + "@types/unist": "^2.0.0", + "is-buffer": "^2.0.0", + "unist-util-stringify-position": "^3.0.0", + "vfile-message": "^3.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/vfile-location/node_modules/vfile-message": { + "version": "3.1.4", + "resolved": "https://registry.npmjs.org/vfile-message/-/vfile-message-3.1.4.tgz", + "integrity": 
"sha512-fa0Z6P8HUrQN4BZaX05SIVXic+7kE3b05PWAtPuYP9QLHsLKYR7/AlLW3NtOrpXRLeawpDLMsVkmk5DG0NXgWw==", + "license": "MIT", + "dependencies": { + "@types/unist": "^2.0.0", + "unist-util-stringify-position": "^3.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, "node_modules/vfile-message": { "version": "4.0.2", "resolved": "https://registry.npmjs.org/vfile-message/-/vfile-message-4.0.2.tgz", @@ -18883,6 +19426,16 @@ "minimalistic-assert": "^1.0.0" } }, + "node_modules/web-namespaces": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/web-namespaces/-/web-namespaces-2.0.1.tgz", + "integrity": "sha512-bKr1DkiNa2krS7qxNtdrtHAmzuYGFQLiQ13TsorsdT6ULTkPLKuu5+GsFpDlg6JFjUTwX2DyhMPG2be8uPrqsQ==", + "license": "MIT", + "funding": { + "type": "github", + "url": "https://github.com/sponsors/wooorm" + } + }, "node_modules/web-vitals": { "version": "2.1.4", "resolved": "https://registry.npmjs.org/web-vitals/-/web-vitals-2.1.4.tgz", diff --git a/contrib/copilot-plugin/package.json b/contrib/copilot-plugin/package.json index 1008a6b0..5888a3ce 100644 --- a/contrib/copilot-plugin/package.json +++ b/contrib/copilot-plugin/package.json @@ -51,6 +51,7 @@ "react-dom": "^18.3.1", "react-icons": "^5.5.0", "react-markdown": "^10.1.0", + "rehype-raw": "^6.1.1", "react-refresh": "^0.11.0", "remark-gfm": "^4.0.1", "resolve": "^1.20.0", @@ -77,6 +78,7 @@ "start": "node scripts/start.js", "start:user1": "REACT_APP_USER=dev.eva PORT=3000 node scripts/start.js", "start:user2": "REACT_APP_USER=dev.ben PORT=3001 node scripts/start.js", + "start:user3": "REACT_APP_USER=dev.unknown PORT=3002 node scripts/start.js", "build": "node scripts/build.js", "test": "node scripts/test.js", "clean": "rimraf build" diff --git a/contrib/copilot-plugin/src/app/ChatBox.tsx b/contrib/copilot-plugin/src/app/ChatBox.tsx index 949edddc..779e4a01 100644 --- a/contrib/copilot-plugin/src/app/ChatBox.tsx +++ b/contrib/copilot-plugin/src/app/ChatBox.tsx @@ -21,8 +21,8 @@ export default function ChatBox() { // Use local backend when running the dev server (npm start), // and use the relative path for production builds (npm run build). const REMOTE_SERVER_URL = process.env.NODE_ENV === 'development' - ? 'http://127.0.0.1:60000/copilot/api/operation' - : '/copilot/api/operation'; + ? 'http://127.0.0.1:60000/copilot/api/stream' + : '/copilot/api/stream'; const makeChatRequest = async (e: React.FormEvent) => { e.preventDefault(); @@ -38,6 +38,17 @@ export default function ChatBox() { setPrompt(""); setLoading(true); try { + // create a stable turnId and include it in the payload so server will echo/use it + const turnId = uuidv4(); + const messageInfo = { + userId: paiuser, + convId: currentConversationId, + turnId: turnId, + timestamp: Math.floor(Date.now()), + timestampUnit: "ms", + type: "question", + }; + const payload = { async_: false, stream: false, @@ -48,18 +59,14 @@ export default function ChatBox() { username: paiuser, restToken: restServerToken, jobToken: jobServerToken, - currentJob: null // currentJob ? 
{ id: currentJob.id, name: currentJob.name, username: currentJob.username, status: currentJob.status, ip: currentJob.ip, port: currentJob.port } : null + currentJob: null }, - messageInfo: { - userId: paiuser, - convId: currentConversationId, - turnId: uuidv4(), - timestamp: Math.floor(Date.now()), - timestampUnit: "ms", - type: "question", - } + messageInfo: messageInfo } }; + + // Create assistant placeholder and attach the same messageInfo (turnId) so feedback maps to this response + useChatStore.getState().addChat({ role: "assistant", message: "", timestamp: new Date(), messageInfo }); const response = await fetch(REMOTE_SERVER_URL, { method: "POST", headers: { @@ -69,15 +76,93 @@ export default function ChatBox() { body: JSON.stringify(payload), }); if (!response.ok) throw new Error("Remote server error"); - const data = await response.json(); - if (data?.data?.answer !== "skip") { - useChatStore.getState().addChat({ - role: "assistant", - message: data?.data?.answer ?? "No answer found", - timestamp: new Date(), - messageInfo: data?.data?.message_info, // Store the message_info from response - }); + + const reader = response.body?.getReader(); + if (!reader) throw new Error('No response body for streaming'); + const decoder = new TextDecoder(); + // Buffer incoming bytes and parse SSE-style messages (separated by '\n\n') + let buffer = ''; + while (true) { + const { value, done: readerDone } = await reader.read(); + if (value) { + buffer += decoder.decode(value, { stream: true }); + } + + // Process all complete SSE messages in buffer + let sepIndex; + while ((sepIndex = buffer.indexOf('\n\n')) !== -1) { + const rawEvent = buffer.slice(0, sepIndex); + buffer = buffer.slice(sepIndex + 2); + + // Extract data: lines and join with newline to preserve original formatting + const lines = rawEvent.split(/\n/); + const dataParts: string[] = []; + let isDoneEvent = false; + for (const line of lines) { + if (line.startsWith('data:')) { + dataParts.push(line.slice(5)); + } else if (line.startsWith('event:')) { + const ev = line.slice(6).trim(); + if (ev === 'done') isDoneEvent = true; + } + } + + if (dataParts.length > 0) { + const dataStr = dataParts.join('\n'); + // If the server sent a JSON 'append' event, append to last assistant message + let handled = false; + const trimmed = dataStr.trim(); + if (trimmed.startsWith('{')) { + try { + const parsed = JSON.parse(trimmed); + if (parsed && parsed.type === 'append' && typeof parsed.text === 'string') { + useChatStore.getState().appendToLastAssistant(parsed.text); + handled = true; + } + else if (parsed && parsed.type === 'meta' && parsed.messageInfo) { + // attach backend-generated messageInfo (turnId etc.) to the last assistant message + useChatStore.getState().setLastAssistantMessageInfo(parsed.messageInfo); + handled = true; + } + } catch (e) { + // not JSON, fall through to full replace + } + } + + if (!handled) { + // If server sent a full snapshot repeatedly (common when backend doesn't send structured append events), + // detect the already-displayed prefix and append only the new suffix. This avoids blinking and missing lines + // during rapid streaming of many list items. 
+ const store = useChatStore.getState(); + const msgs = store.chatMsgs; + let lastAssistant = ""; + for (let i = msgs.length - 1; i >= 0; i--) { + if (msgs[i].role === 'assistant') { + lastAssistant = msgs[i].message || ''; + break; + } + } + + if (lastAssistant && dataStr.startsWith(lastAssistant)) { + const suffix = dataStr.slice(lastAssistant.length); + if (suffix.length > 0) store.appendToLastAssistant(suffix); + } else { + // Fallback: replace the last assistant message with the full reconstructed text + store.replaceLastAssistant(dataStr); + } + } + } + + if (isDoneEvent) { + // stream finished + break; + } + } + + if (readerDone) break; } + + // After the streaming loop, do not alter the assembled markdown so newlines are preserved } catch (err) { toast.error("Failed to get response from remote server"); } diff --git a/contrib/copilot-plugin/src/app/ChatHistory.tsx b/contrib/copilot-plugin/src/app/ChatHistory.tsx index 5f460d15..b868f035 100644 --- a/contrib/copilot-plugin/src/app/ChatHistory.tsx +++ b/contrib/copilot-plugin/src/app/ChatHistory.tsx @@ -11,6 +11,7 @@ import { Bot, User, ThumbsUp, ThumbsDown } from "lucide-react"; import Markdown, { Components } from "react-markdown"; import remarkGfm from "remark-gfm"; +import rehypeRaw from 'rehype-raw'; import { ChatMessage, useChatStore } from "../libs/state"; import { Pane } from "../components/pane"; @@ -101,6 +102,7 @@ const CustomMarkdown: React.FC<{ content: string }> = ({ content }) => {
{props.children}; @@ -160,7 +162,7 @@ const Message: React.FC<{ message: ChatMessage, expand?: boolean, isAssistant?: messageInfo: { userId: paiuser, convId: currentConversationId, - turnId: uuidv4(), // Use message's turnId or fallback to "0" + turnId: message.messageInfo?.turnId || "0", timestamp: Math.floor(Date.now()), timestampUnit: "ms", type: "feedback", @@ -298,13 +300,103 @@ const GroupedChatMessages: React.FC = () => { const messages = useChatStore((state) => state.chatMsgs); const scrollRef = useRef(null); - useEffect(() => { - if (scrollRef.current) { - scrollRef.current.scrollTop = scrollRef.current.scrollHeight; - } - }, [messages]); - + // compute grouped messages and helper values early so effects can reference them const groupedMessages = groupMessages(messages); + const lastText = groupedMessages.length + ? groupedMessages[groupedMessages.length - 1].messages.map((m) => m.message).join('\n') + : '' + const NEAR_BOTTOM_THRESHOLD = 120 + + // Reliable auto-scroll for both new messages and streaming updates. + const prevCountRef = React.useRef(0); + const lastTextRef = React.useRef(''); + + // Scroll helper: find nearest scrollable element that actually overflows, otherwise fallback to window + const scrollToBottom = (startTarget?: HTMLElement | null) => { + const startEl = startTarget || scrollRef.current + if (!startEl) return + let cur: HTMLElement | null = startEl + while (cur && cur !== document.body) { + try { + const style = window.getComputedStyle(cur) + const overflowY = style.overflowY + if ((overflowY === 'auto' || overflowY === 'scroll' || overflowY === 'overlay') && cur.scrollHeight > cur.clientHeight) { + cur.scrollTop = cur.scrollHeight + return + } + } catch (e) { + // ignore + } + cur = cur.parentElement + } + + // fallback to window/document + try { + window.scrollTo(0, document.documentElement.scrollHeight) + } catch (e) { + // ignore + } + } + + useEffect(() => { + const el = scrollRef.current + if (!el) return + + const distanceFromBottom = el.scrollHeight - el.scrollTop - el.clientHeight + const shouldScroll = + distanceFromBottom < NEAR_BOTTOM_THRESHOLD && + (prevCountRef.current != groupedMessages.length || lastTextRef.current != lastText) + + if (shouldScroll) { + // try smooth scrolling in next frame + requestAnimationFrame(() => { + try { + scrollToBottom(el) + } catch (e) { + // ignore + } + }) + + // fallback: ensure scroll after a short delay + setTimeout(() => { + try { + scrollToBottom(el) + } catch (e) { + // ignore + } + }, 120) + } + + prevCountRef.current = groupedMessages.length + lastTextRef.current = lastText + }, [groupedMessages.length, lastText]) + + // observe DOM changes to catch streaming incremental updates + useEffect(() => { + const el = scrollRef.current + if (!el) return + + const observer = new MutationObserver((mutations) => { + try { + const distanceFromBottom = el.scrollHeight - el.scrollTop - el.clientHeight + if (distanceFromBottom < NEAR_BOTTOM_THRESHOLD) { + requestAnimationFrame(() => { + try { + scrollToBottom(el) + } catch (e) { + // ignore + } + }) + } + } catch (e) { + // ignore + } + }) + + observer.observe(el, { childList: true, subtree: true, characterData: true }) + return () => observer.disconnect() + }, [scrollRef]); + return (
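For reference, the reader loop in ChatBox.tsx above assumes SSE-style framing from the new /copilot/api/stream endpoint: each event is a block of data: lines terminated by a blank line, with a closing event: done frame. The following is an illustrative TypeScript sketch of that round trip, not code from this diff; frameChunk and parseEvent are hypothetical names, and the sketch strips the single space after data: per SSE convention, whereas the reader above keeps line.slice(5) verbatim, so that space is preserved there.

    // Illustrative sketch (not part of this diff): SSE framing as produced by the
    // server's event_stream() generator and reassembled on the client.
    // frameChunk / parseEvent are hypothetical helpers used only for this example.
    const frameChunk = (chunk: string): string =>
      chunk.split("\n").map((ln) => `data: ${ln}`).join("\n") + "\n\n";

    const parseEvent = (rawEvent: string): { text: string; done: boolean } => {
      const dataParts: string[] = [];
      let done = false;
      for (const line of rawEvent.split("\n")) {
        if (line.startsWith("data:")) {
          // SSE convention: drop a single leading space after "data:"
          dataParts.push(line.slice(5).replace(/^ /, ""));
        } else if (line.startsWith("event:") && line.slice(6).trim() === "done") {
          done = true;
        }
      }
      return { text: dataParts.join("\n"), done };
    };

    // A two-line markdown chunk survives the round trip with its newline intact.
    const framed = frameChunk("- item 1\n- item 2"); // "data: - item 1\ndata: - item 2\n\n"
    const event = parseEvent(framed.trimEnd());      // { text: "- item 1\n- item 2", done: false }

The blank-line delimiter is what the reader's buffer.indexOf('\n\n') split relies on, so partial chunks delivered by fetch stay in the buffer until a complete event has arrived.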
diff --git a/contrib/copilot-plugin/src/components/pane.tsx b/contrib/copilot-plugin/src/components/pane.tsx index 34ca7e7c..53bc8dbc 100644 --- a/contrib/copilot-plugin/src/components/pane.tsx +++ b/contrib/copilot-plugin/src/components/pane.tsx @@ -1,19 +1,24 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT License. +import React from "react"; import { cn } from "../libs/utils"; interface PaneProps extends React.HTMLAttributes {} -export const Pane: React.FC = ({ children, className }) => { - return ( -
- {children} -
- ); -}; +export const Pane = React.forwardRef( + ({ children, className }, ref) => { + return ( +
+ {children} +
+ ); + } +); +Pane.displayName = "Pane"; diff --git a/contrib/copilot-plugin/src/libs/state.ts b/contrib/copilot-plugin/src/libs/state.ts index f5a0605e..ca061649 100644 --- a/contrib/copilot-plugin/src/libs/state.ts +++ b/contrib/copilot-plugin/src/libs/state.ts @@ -55,6 +55,9 @@ interface State { setAllModelsInCurrentJob: (models: string[]) => void; setCurrentModel: (model: string | null) => void; addChat: (chat: ChatMessage) => void; + appendToLastAssistant: (chunk: string) => void; + replaceLastAssistant: (text: string) => void; + setLastAssistantMessageInfo: (info: any) => void; // Conversation management actions generateNewConversationId: () => void; @@ -98,7 +101,37 @@ export const useChatStore = create((set) => ({ setCurrentModel: (model) => set({ currentModel: model }), addChat: (log) => set((state) => ({ chatMsgs: [...state.chatMsgs, log] })), - + appendToLastAssistant: (chunk: string) => set((state) => { + const msgs = [...state.chatMsgs]; + for (let i = msgs.length - 1; i >= 0; i--) { + if (msgs[i].role === 'assistant') { + msgs[i] = { ...msgs[i], message: (msgs[i].message || '') + chunk }; + break; + } + } + return { chatMsgs: msgs }; + },), + replaceLastAssistant: (text: string) => set((state) => { + const msgs = [...state.chatMsgs]; + for (let i = msgs.length - 1; i >= 0; i--) { + if (msgs[i].role === 'assistant') { + msgs[i] = { ...msgs[i], message: text }; + break; + } + } + return { chatMsgs: msgs }; + }), + setLastAssistantMessageInfo: (info: any) => set((state) => { + const msgs = [...state.chatMsgs]; + for (let i = msgs.length - 1; i >= 0; i--) { + if (msgs[i].role === 'assistant') { + msgs[i] = { ...msgs[i], messageInfo: info }; + break; + } + } + return { chatMsgs: msgs }; + }), + // Generate a new conversation ID (useful for starting a new conversation) generateNewConversationId: () => set((state) => ({ currentConversationId: uuidv4(), diff --git a/src/copilot-chat/config/copilot-chat.yaml b/src/copilot-chat/config/copilot-chat.yaml index 2adb9e0b..1675cb29 100644 --- a/src/copilot-chat/config/copilot-chat.yaml +++ b/src/copilot-chat/config/copilot-chat.yaml @@ -21,7 +21,7 @@ agent-host: "0.0.0.0" agent-port: "50000" secure-port: "8443" history-depth: "64" -version: "f3" +version: "f4" agent-mode: "remote" agent-mode-ca: "local" azure-openai-api-key: "" diff --git a/src/copilot-chat/src/copilot_agent/__main__.py b/src/copilot-chat/src/copilot_agent/__main__.py index eed047a3..e3927657 100644 --- a/src/copilot-chat/src/copilot_agent/__main__.py +++ b/src/copilot-chat/src/copilot_agent/__main__.py @@ -4,13 +4,11 @@ """Main module.""" from .copilot_service import CoPilotService -from .copilot_conversation import CoPilotConversation def main(): """Main function.""" - copilot_conversation = CoPilotConversation() - api = CoPilotService(copilot_conversation) + api = CoPilotService() api.run() diff --git a/src/copilot-chat/src/copilot_agent/config.py b/src/copilot-chat/src/copilot_agent/config.py index 823f2bb6..661260e7 100644 --- a/src/copilot-chat/src/copilot_agent/config.py +++ b/src/copilot-chat/src/copilot_agent/config.py @@ -16,7 +16,7 @@ # set agent cross talk AGENT_PORT = int(os.getenv('AGENT_PORT', '50000')) AGENT_MODE_LOCAL = os.getenv('AGENT_MODE', '').lower() == 'local' -AGENT_MODE_CA_LOCAL = os.getenv('AGENT_MODE_CA', '').lower() == 'local' +AGENT_MINIMAL_ON = os.getenv('AGENT_MINIMAL', '').lower() == 'on' # TO_CONTROLLER = True # @@ -31,4 +31,3 @@ def print_env_variables(): logger.info(f"Env Var: COPILOT_VERSION: {os.getenv('COPILOT_VERSION', 'na')}") 
logger.info(f"Env Var: AGENT_PORT: {os.getenv('AGENT_PORT', '50000')}") logger.info(f"Env Var: AGENT_MODE: {os.getenv('AGENT_MODE', 'na')}") - logger.info(f"Env Var: AGENT_MODE_CA: {os.getenv('AGENT_MODE_CA', 'na')}") diff --git a/src/copilot-chat/src/copilot_agent/copilot_conversation.py b/src/copilot-chat/src/copilot_agent/copilot_conversation.py index 00ac213a..ded85627 100644 --- a/src/copilot-chat/src/copilot_agent/copilot_conversation.py +++ b/src/copilot-chat/src/copilot_agent/copilot_conversation.py @@ -5,7 +5,6 @@ import os from collections import deque -import uuid import pandas as pd from datetime import datetime, timezone from typing import Union @@ -15,8 +14,9 @@ from .utils.authentication import AuthenticationManager from .utils.kql_executor import KustoExecutor -from .config import AGENT_MODE_LOCAL, print_env_variables +from .config import AGENT_MINIMAL_ON, print_env_variables from .copilot_turn import CoPilotTurn +from .utils.llmsession import LLMSession HISTORY_DEPTH = int(os.getenv('COPILOT_HISTORY_DEPTH', 64)) if HISTORY_DEPTH <= 0: @@ -60,12 +60,13 @@ def __init__(self, response: dict) -> None: # --- New CoPilot class (business logic only) --- class CoPilotConversation: """CoPilot Conversation, manages the inquiry/response turns for each user.""" - def __init__(self): + def __init__(self, llm_session: LLMSession) -> None: """Initialize CoPilotConversation, message history, and authentication manager.""" print_env_variables() - self.copilot = CoPilotTurn(verbose=False) + self.copilot_turn = CoPilotTurn(llm_session=llm_session, verbose=False) self.msg_dict = {} # Dictionary to store message deques per user self.auth_manager = AuthenticationManager() + self.llm_session = llm_session def manage_conv_history(self, user_id: str, message: dict) -> None: """Append a message to the user's message history.""" @@ -95,23 +96,23 @@ def perform_operation(self, in_parameters: InParameters) -> OutParameters: # process if _is_feedback: - user_id, conv_id = self._extract_user_and_conv_id(question_msg_info) - result = self._handle_feedback_only(user_id, conv_id) + user_id, conv_id, turn_id = self._extract_user_and_conv_id(question_msg_info) + result = self._handle_feedback_only(user_id, conv_id, turn_id) elif _is_question: - user_id, conv_id = self._extract_user_and_conv_id(question_msg_info) + user_id, conv_id, turn_id = self._extract_user_and_conv_id(question_msg_info) # Authenticate only for user question if not self.auth_manager.is_authenticated(username): logger.info(f'User {username} not authenticated, attempting to authenticate...') self.auth_manager.set_authenticate_state(username, rest_token) if not self.auth_manager.is_authenticated(username): logger.error(f'User {username} failed authentication twice. 
Aborting operation.') - result = self._handle_authenticate_failure(user_id, conv_id) + result = self._handle_authenticate_failure(user_id, conv_id, turn_id) else: logger.info(f'User {username} authenticated successfully.') - result = self._handle_user_question(user_id, conv_id, user_prompt, skip_summary, debugging, question_msg_info) + result = self._handle_user_question(user_id, conv_id, turn_id, user_prompt, skip_summary, debugging, question_msg_info) else: logger.info(f'User {username} authenticated successfully.') - result = self._handle_user_question(user_id, conv_id, user_prompt, skip_summary, debugging, question_msg_info) + result = self._handle_user_question(user_id, conv_id, turn_id, user_prompt, skip_summary, debugging, question_msg_info) else: result = self._handle_empty_input() @@ -134,10 +135,12 @@ def _extract_user_and_conv_id(self, question_msg_info): if question_msg_info is not None: user_id = question_msg_info.get('userId', 'unknown') conv_id = question_msg_info.get('convId', 'na') + turn_id = question_msg_info.get('turnId', 'na') else: user_id = 'unknown' conv_id = 'na' - return user_id, conv_id + turn_id = 'na' + return user_id, conv_id, turn_id def _is_feedback_only(self, user_feedback, user_prompt): """Return True if only feedback is provided, not a user question.""" @@ -147,10 +150,10 @@ def _is_user_question_only(self, user_feedback, user_prompt): """Return True if only a user question is provided, not feedback.""" return not user_feedback and user_prompt - def _handle_feedback_only(self, user_id, conv_id): + def _handle_feedback_only(self, user_id, conv_id, turn_id): """Handle the case where only feedback is provided.""" logger.info('User feedback provided without a user question. No operation is required.') - resp = self._make_skip_response(user_id, conv_id, 'feedback_ack') + resp = self._make_skip_response(user_id, conv_id, turn_id, 'feedback_ack') out_parameters = OutParameters(resp) return out_parameters @@ -161,28 +164,36 @@ def _handle_empty_input(self): out_parameters = OutParameters(resp) return out_parameters - def _handle_authenticate_failure(self, user_id, conv_id): + def _handle_authenticate_failure(self, user_id, conv_id, turn_id): """Handle authentication failure case.""" logger.info('User authentication failed. 
Aborting operation.') - resp = self._make_skip_response(user_id, conv_id, 'error') + resp = self._make_skip_response(user_id, conv_id, turn_id, 'error') out_parameters = OutParameters(resp) + + # If LLM session is available, set up thread-local context and use push_frontend_event + if self.llm_session: + from .utils.push_frontend import set_thread_llm_session, push_frontend_event + set_thread_llm_session(self.llm_session) + error_message = 'Unauthorized - Authentication failed' + push_frontend_event(error_message) + return out_parameters - def _handle_user_question(self, user_id, conv_id, user_prompt, skip_summary, debugging, question_msg_info): + def _handle_user_question(self, user_id, conv_id, turn_id, user_prompt, skip_summary, debugging, question_msg_info): """Handle the case where only a user question is provided.""" if user_id not in self.msg_dict: self.msg_dict[user_id] = deque(maxlen=HISTORY_DEPTH) msg_user = {'role': 'user', 'content': user_prompt} self.manage_conv_history(user_id, msg_user) logger.info(f'[internal control word] [per user check] user "{user_id}" msg_list length is {len(self.msg_dict[user_id])}') - resp = self.copilot.process_turn(self.msg_dict[user_id], skip_summary, debugging) + resp = self.copilot_turn.process_turn(self.msg_dict[user_id], skip_summary, debugging) if not isinstance(resp, dict): logger.info('Unexpected response format from copilot.process_turn') - return self.handle_unexpected_copilot_response(user_id, conv_id) + return self.handle_unexpected_copilot_response(user_id, conv_id, turn_id) response_message_info = { 'userId': user_id, 'convId': conv_id, - 'turnId': str(uuid.uuid4()), + 'turnId': turn_id, 'timestamp': int(datetime.now(timezone.utc).timestamp() * 1000), 'type': 'answer', 'timestampUnit': 'ms', @@ -248,17 +259,18 @@ def _log_message_data(self, inout: str, parameters: Union[InParameters, OutParam 'Debug': debug, } logger.info(f'[copilot data collection] {log_data}') - # ingest kusto table - self.collect_data_to_kusto(log_data) + if not AGENT_MINIMAL_ON: + # ingest kusto table + self.collect_data_to_kusto(log_data) - def handle_unexpected_copilot_response(self, user_id: str, conv_id: str) -> OutParameters: + def handle_unexpected_copilot_response(self, user_id: str, conv_id: str, turn_id: str) -> OutParameters: """Handle unexpected response format from copilot agent and log error.""" error_resp = { 'answer': 'Internal error: unexpected response format from copilot agent.', 'messageInfo': { 'userId': user_id, 'convId': conv_id, - 'turnId': str(uuid.uuid4()), + 'turnId': turn_id, 'timestamp': int(datetime.now(timezone.utc).timestamp() * 1000), 'type': 'error', 'timestampUnit': 'ms', @@ -293,14 +305,14 @@ def collect_data_to_kusto(self, log_data: dict): logger.error(f"Exception during Kusto analytics collection: {e}", exc_info=True) @staticmethod - def _make_skip_response(user_id, conv_id, type_str): + def _make_skip_response(user_id, conv_id, turn_id, type_str): """Create a standard skip/error response struct for feedback or authentication failure.""" return { 'answer': 'skip', 'messageInfo': { 'userId': user_id, 'convId': conv_id, - 'turnId': str(uuid.uuid4()), + 'turnId': turn_id, 'timestamp': int(datetime.now(timezone.utc).timestamp() * 1000), 'type': type_str, 'timestampUnit': 'ms', diff --git a/src/copilot-chat/src/copilot_agent/copilot_service.py b/src/copilot-chat/src/copilot_agent/copilot_service.py index 5eecd58b..3544ba98 100644 --- a/src/copilot-chat/src/copilot_agent/copilot_service.py +++ 
b/src/copilot-chat/src/copilot_agent/copilot_service.py @@ -7,10 +7,14 @@ from flask import Flask, jsonify, request from flask_cors import CORS import threading +from flask import Response, stream_with_context +import json +import queue from .copilot_conversation import CoPilotConversation from .utils.logger import logger +from .utils.llmsession import LLMSession from .config import AGENT_PORT, AGENT_MODE_LOCAL @@ -18,18 +22,19 @@ # --- New CoPilotAPI class (Flask app setup and endpoints) --- class CoPilotService: """Flask app and endpoint manager for CoPilot.""" - def __init__(self, copilot_conversation: CoPilotConversation): + def __init__(self): """ Initialize the CoPilotAPI with a CoPilot instance, set up Flask app and endpoints. Args: copilot: Instance of CoPilot business logic class. """ - self.copilot_conversation = copilot_conversation + self.sessions = {} self.host = os.getenv('AGENT_HOST', '127.0.0.1') self.app = Flask(__name__) self.app.add_url_rule('/copilot/api/status', view_func=self.status, methods=['GET']) self.app.add_url_rule('/copilot/api/operation', view_func=self.instance_operation, methods=['POST']) + self.app.add_url_rule('/copilot/api/stream', view_func=self.stream_operation, methods=['POST']) # If running in local agent mode, enable CORS to allow local testing from dev frontends. if AGENT_MODE_LOCAL: @@ -43,13 +48,29 @@ def status(self): """GET endpoint for health/status check.""" return jsonify({"status": "running"}) + def get_or_create_session(self, user_id, conv_id): + """Retrieve or create a copilot_conversation for the given userId and convId, reusing its LLMSession. + + A new LLMSession is created ONLY when the conversation is first seen; subsequent requests reuse + the existing session to avoid repeated client/session setup overhead. This helps reduce per-request + latency (~hundreds of ms) previously incurred by constructing new OpenAI/Azure clients. + """ + session_key = f"{user_id}_{conv_id}" + if session_key not in self.sessions: + self.sessions[session_key] = CoPilotConversation(LLMSession()) + return self.sessions[session_key] + def instance_operation(self): """POST endpoint to handle copilot operations.""" logger.info("Received request at /copilot/api/operation") try: data = request.get_json() - in_parameters = self.copilot_conversation.build_in_parameters(data) - out_parameters = self.copilot_conversation.perform_operation(in_parameters) + user_id = data['data']['messageInfo']['userId'] + conv_id = data['data']['messageInfo']['convId'] + copilot_conversation = self.get_or_create_session(user_id, conv_id) + + in_parameters = copilot_conversation.build_in_parameters(data) + out_parameters = copilot_conversation.perform_operation(in_parameters) response = { "status": "success", "data": out_parameters.__dict__ @@ -59,6 +80,83 @@ def instance_operation(self): logger.info(f"Error handling copilot/api/operation: {e}") return jsonify({"status": "error", "message": str(e)}), 500 + def stream_operation(self): + """POST endpoint to stream operation output as chunked text (SSE-like). + + The endpoint accepts the same JSON payload as the normal operation endpoint. + It sets a module-level callback in the summary module so streaming chunks are + forwarded to the HTTP response. This avoids changing many internal call chains. 
+ """ + logger.info("Received request at /copilot/api/stream") + + # Create queue BEFORE the callback function + q = queue.Queue() + + def on_chunk(chunk: str): + # put chunk into queue for streaming response + q.put(chunk) + + try: + data = request.get_json() + user_id = data['data']['messageInfo']['userId'] + conv_id = data['data']['messageInfo']['convId'] + copilot_conversation = self.get_or_create_session(user_id, conv_id) + llm_session = copilot_conversation.llm_session + # Attach streaming callback to existing session (no new session creation) + llm_session.set_instance_stream_callback(on_chunk) + except KeyError as e: + logger.error(f"Missing key in JSON body for stream_operation: {e}") + return jsonify({"status": "error", "message": f"Missing key: {e}"}), 400 + except Exception as e: + logger.error(f"Failed to parse JSON body for stream_operation: {e}") + return jsonify({"status": "error", "message": "invalid json"}), 400 + + + + def worker(): + # Use the llm_session from the copilot_conversation + try: + in_parameters = copilot_conversation.build_in_parameters(data) + # Reuse the llm_session passed to the conversation + copilot_conversation.perform_operation(in_parameters) + except Exception as e: + logger.error(f"Error during streaming operation worker: {e}") + import traceback + logger.error(traceback.format_exc()) + q.put(json.dumps({'error': str(e)})) + finally: + # Clear streaming callback to avoid affecting subsequent non-stream requests + try: + llm_session.clear_instance_stream_callback() + except Exception: + logger.debug('Failed to clear instance stream callback') + # signal end of stream + q.put(None) + + def event_stream(): + # start worker thread + t = threading.Thread(target=worker, daemon=True) + t.start() + # yield chunks as they arrive + while True: + item = q.get() + if item is None: + break + # SSE-style framing: prefix each line with 'data:' to support multi-line payloads + text = str(item) + lines = text.splitlines() + if len(lines) == 0: + yield "data: \n\n" + else: + for ln in lines: + yield f"data: {ln}\n" + # event delimiter + yield "\n" + # final event + yield "event: done\n\n" + + return Response(stream_with_context(event_stream()), mimetype='text/event-stream') + def run_http_server(self): """Start the Flask HTTP server.""" port = AGENT_PORT diff --git a/src/copilot-chat/src/copilot_agent/copilot_turn.py b/src/copilot-chat/src/copilot_agent/copilot_turn.py index 2d0d9ada..98d09616 100644 --- a/src/copilot-chat/src/copilot_agent/copilot_turn.py +++ b/src/copilot-chat/src/copilot_agent/copilot_turn.py @@ -9,82 +9,107 @@ from .utils.logger import logger from .config import COPILOT_VERSION, DATA_DIR, PROMPT_DIR -from .ltp import ltp_auto_reject, ltp_human_intervention, query_metadata, query_metrics, query_user_manual, query_powerbi +from .ltp import LTP from .utils import ( + Contextualizer, LLMSession, LTPReportProcessor, QuestionClassifier, - contextualize_question, - gen_smart_help, get_prompt_from, + push_frontend_event, + set_thread_llm_session, + SmartHelp ) class CoPilotTurn: """CoPilot Turn, handles each inquiry/response turn.""" - def __init__(self, model: LLMSession = None, verbose: bool = False) -> None: + def __init__(self, llm_session: LLMSession, verbose: bool = False) -> None: """Initialize.""" - if model is None: - model = LLMSession() - self.model = model self.verbose = verbose + self.llm_session = llm_session # Initialize version self._version = self._initialize_version() # Load help message self.help_msg = self.load_help_message() - # 
Question Classifier - self.classifier = QuestionClassifier(self._version, self.model) + self.system_prompt_answer_general = get_prompt_from(os.path.join(PROMPT_DIR, 'gen_smart_help_prompt_general.txt')) + self.classifier = QuestionClassifier(self._version, self.llm_session) + self.contextualizer = Contextualizer(self.llm_session) + self.processor = LTP(self.llm_session) + self.smart_help = SmartHelp(self.help_msg, self.llm_session) # entry function, processes the list of messages and returns a dictionary with the results def process_turn(self, messages_list: list, skip_summary: bool = False, debugging: bool = False) -> dict: """Process the list of messages and return a dictionary with the results.""" - if debugging: - logger.info(f'DEBUGGING: {debugging}') - return {'category': None, 'answer': 'DEBUGGING MODE ENABLED', 'debug': {'debugging': debugging}} - # get contextualized question from this and last user inquiry + # Set thread-local session for push_frontend functions to use correct callback + set_thread_llm_session(self.llm_session) + + # get from message list + this_inquiry = messages_list[-1]['content'] last_inquiry = messages_list[-3]['content'] if len(messages_list) > 2 else None - question = contextualize_question(this_inquiry, last_inquiry) - # classify the question to determine the solution source and method - question_type = self.classifier.classify_question(question) - # objective, concern in the question - obj, con = question_type.get('lv0_object', '3. [general]'), question_type.get('lv1_concern', '0. [others]') - - # verion f3, resolves objective 8 (Lucia Training Platform) - if self._version == 'f3': - if obj.count('8') > 0: + # debug only + + if self._version == 'f4': + push_frontend_event('🧠 Copilot is understanding your request...
', replace=False) + question_type = self.classifier.parse_question(this_inquiry, last_inquiry) + question = question_type.get('new_question', this_inquiry) + obj = question_type.get('lv0_object', '3. [general]') + con = question_type.get('lv1_concern', '0. [others]') + else: + # get contextualized question from this and last user inquiry + push_frontend_event('🤔 Copilot is understanding your request...
', replace=False) + question = self.contextualizer.contextualize(this_inquiry, last_inquiry) + # classify the question to determine the solution source and method + push_frontend_event('🔍 Copilot is finding the right data source...
', replace=False) + question_type = self.classifier.classify_question(question) + obj, con = question_type.get('lv0_object', '3. [general]'), question_type.get('lv1_concern', '0. [others]') + + # version f3, f4, resolves objective 8 (Lucia Training Platform) + push_frontend_event('⏳ Copilot is processing your inquiry...
', replace=False) + self.smart_help.llm_session = self.llm_session # ensure processor uses the current llm_session + if self._version in ['f3', 'f4']: + # If classification failed, treat as unsupported. + if obj is None or con is None: + help_keys = ['unsupported_question'] + answer = self.smart_help.generate(question, help_keys, True) + debug = {} + elif obj.count('8') > 0: answer, debug = self.query_ltp(question, con, skip_summary) elif obj.count('3') > 0: - answer = self.gen_smart_help_general(question) + answer = self.gen_answer_general(question) debug = {} elif obj.count('9') > 0: help_keys = ['feature'] - answer = gen_smart_help(self.help_msg, question, help_keys) + answer = self.smart_help.generate(question, help_keys, True) debug = {} else: help_keys = ['unsupported_question'] - answer = gen_smart_help(self.help_msg, question, help_keys) + answer = self.smart_help.generate(question, help_keys, True) debug = {} else: # Placeholder for other version implementations help_keys = ['unsupported_question'] - answer = gen_smart_help(self.help_msg, question, help_keys) + answer = self.smart_help.generate(question, help_keys, True) debug = {} + return {'category': question_type, 'answer': answer, 'debug': debug} def query_ltp(self, question: str, con: str, skip_summary: bool) -> tuple[str, dict]: """Query about Lucia Training Platform.""" - # Mapping concern codes to handler functions + self.processor.llm_session = self.llm_session # ensure processor uses the current llm_session + # Mapping concern codes to handler functions + # Updated to pass llm_session to prevent singleton blocking handlers = { - '1': lambda: query_metrics(question, self.help_msg, skip_summary), - '2': lambda: query_metadata(question, self.help_msg, skip_summary), - '3': lambda: query_user_manual(question, self.help_msg), - '4': lambda: query_powerbi(question, self.help_msg), - '5': lambda: ltp_auto_reject(question, self.help_msg), - '6': lambda: ltp_human_intervention(question, self.help_msg), + '1': lambda: self.processor.query_metrics(question, self.help_msg, skip_summary), + '2': lambda: self.processor.query_metadata(question, self.help_msg, skip_summary), + '3': lambda: self.processor.query_user_manual(question, self.help_msg), + '4': lambda: self.processor.query_powerbi(question, self.help_msg), + '5': lambda: self.processor.auto_reject(question, self.help_msg), + '6': lambda: self.processor.human_intervention(question, self.help_msg), } for code, handler in handlers.items(): if con.count(code) > 0: @@ -92,12 +117,13 @@ def query_ltp(self, question: str, con: str, skip_summary: bool) -> tuple[str, d return 'unsupported concern.', {} # generate generic smart help message based on user input - def gen_smart_help_general(self, question: str) -> str: + def gen_answer_general(self, question: str) -> str: """Generate smart help message based on user input.""" - system_prompt = get_prompt_from(os.path.join(PROMPT_DIR, 'gen_smart_help_prompt_general.txt')) + system_prompt = self.system_prompt_answer_general if isinstance(self.help_msg, dict) and 'feature' in self.help_msg: system_prompt = system_prompt + '\n\n' + self.help_msg['feature'] - summary = self.model.chat(system_prompt, f'question is: {question}') + push_frontend_event('🌐 Accessing public information...
', replace=False) + summary = self.llm_session.try_stream_fallback_chat(system_prompt, f'question is: {question}') return summary def get_preload_dashboard(self): @@ -119,7 +145,7 @@ def get_preload_dashboard(self): def load_help_message(self) -> dict: """Load the help message based on the version.""" - help_doc_path = os.path.join(PROMPT_DIR, 'help', f'infrawise_help_{self._version}.json') + help_doc_path = os.path.join(PROMPT_DIR, 'help', f'infrawise_help.json') if not os.path.exists(help_doc_path): logger.error(f'Help doc not found: {help_doc_path}') raise FileNotFoundError(f'Help doc not found: {help_doc_path}') @@ -130,7 +156,7 @@ def load_help_message(self) -> dict: def _initialize_version(self) -> str: """Determine and set the version.""" - allowed_versions = {'f2', 'f3', 'f0f1'} + allowed_versions = {'f2', 'f3', 'f0f1', 'f4'} version = COPILOT_VERSION if COPILOT_VERSION in allowed_versions else 'f3' logger.info(f'CoPilot - ver:{version}') return version diff --git a/src/copilot-chat/src/copilot_agent/ltp/__init__.py b/src/copilot-chat/src/copilot_agent/ltp/__init__.py index 848910cc..52d55d16 100644 --- a/src/copilot-chat/src/copilot_agent/ltp/__init__.py +++ b/src/copilot-chat/src/copilot_agent/ltp/__init__.py @@ -3,13 +3,8 @@ """Init file for LTP module.""" -from .ltp import ltp_auto_reject, ltp_human_intervention, query_metadata, query_metrics, query_user_manual, query_powerbi +from .ltp import LTP __all__ = [ - 'ltp_auto_reject', - 'ltp_human_intervention', - 'query_metadata', - 'query_metrics', - 'query_user_manual', - 'query_powerbi', + 'LTP' ] diff --git a/src/copilot-chat/src/copilot_agent/ltp/ltp.py b/src/copilot-chat/src/copilot_agent/ltp/ltp.py index 0805d731..e03000f3 100644 --- a/src/copilot-chat/src/copilot_agent/ltp/ltp.py +++ b/src/copilot-chat/src/copilot_agent/ltp/ltp.py @@ -16,6 +16,7 @@ from ..utils.openpai import execute_openpai_query from ..utils.powerbi import LTPReportProcessor from ..utils.promql import execute_promql_query, execute_promql_query_step, retrive_promql_response_value +from ..utils.push_frontend import push_frontend_event from ..utils.query import gen_promql_query from ..utils.summary import gen_summary from ..utils.time import get_current_unix_timestamp @@ -23,241 +24,253 @@ from ..utils.utils import get_prompt_from from .ltp_dashboard import query_generation_kql -model = LLMSession() -SUB_FEATURE = 'ltp' - -SKIP_LUCIA_CONTROLLER_EXECUTION = True +class LTP: + """LTP Query Engine for handling various LTP-related queries.""" + + SUB_FEATURE = 'ltp' + SKIP_LUCIA_CONTROLLER_EXECUTION = True + + def __init__(self, llm_session: LLMSession = None) -> None: + """ + Initialize LTP Query Engine. + + Args: + llm_session: LLM session for generating queries and summaries + """ + self.llm_session = llm_session + self.feature_skipped = True + self.ltp_documentation = get_prompt_from(os.path.join(PROMPT_DIR, self.SUB_FEATURE, 'ltp_documentation.txt')) + + def query_metrics(self, question: str, help_msg, skip_summary: bool = False): + """Query cluster or job metrics from Prometheus backend.""" + + if self.feature_skipped: + logger.info('Skipping PromQL query generation.') + query, end_time_stamp, parallel, param = None, None, None, {'scrape_interval': None, 'time_offset': None} + logger.info('Skipping PromQL query execution.') + resp = f'Skipping PromQL query execution due to lack of support, this will be enabled in the next release.' 
+ resp_parser_param = resp + value = {'result': resp} + else: + # generate query + logger.info('Generating Query: LTP, Metrics') + query, end_time_stamp, parallel, param = gen_promql_query(self.SUB_FEATURE, question, self.llm_session) + + if not query: + logger.info(f'No query found in the response, query is {query}') + answer = 'Query generation failed, no query found.' + return answer, None + # execute + logger.info('Executing Query: LTP, Metrics') + if not parallel: + resp = execute_promql_query(query, {}) + else: + resp = execute_promql_query_step(query, {}, end_time_stamp, f"{param['time_offset']}") + resp_parser_param, value = retrive_promql_response_value(resp, param) + + # generate answer + logger.info('Generating Answer: LTP, Metrics') + summary = gen_summary( + self.SUB_FEATURE, + {'cluster_job_metrics': value}, + {'cluster_job_metrics': value}, + question, + 'gen_result_summary_metrics.txt', + None, + help_msg, + skip_summary, + self.llm_session + ) -# session: query cluster or job metrics from Prometheus REST API -def query_metrics(question: str, help_msg, skip_summary: bool = False): - """Query cluster or job metrics from Prometheus backend.""" - - if SKIP_LUCIA_CONTROLLER_EXECUTION: - logger.info('Skipping PromQL query generation.') - query, end_time_stamp, parallel, param = None, None, None, {'scrape_interval': None, 'time_offset': None} - logger.info('Skipping PromQL query execution.') - resp = f'Skipping PromQL query execution due to lack of support, this will be enabled in the next release.' - resp_parser_param = resp - value = {'result': resp} - else: + # generate additional info dict + info_dict = {} + info_dict['query_promql'] = query + info_dict['query_param_promql'] = param + info_dict['resp_promql'] = resp + info_dict['value_promql'] = value + info_dict['__debug:f1_query'] = query + info_dict['__debug:f2_execution_method'] = parallel + info_dict['__debug:f3_scrape_interval_in_execution'] = param.get('scrape_interval', None) + info_dict['__debug:f4_resp_parser_param'] = resp_parser_param + info_dict['__debug:r0_type_resp_promql'] = type(resp).__name__ + + return summary, info_dict + + def query_metadata(self, question: str, help_msg, skip_summary: bool = False): + """Query job metadata from OpenPAI backend.""" # generate query - logger.info('Generating Query: LTP, Metrics') - query, end_time_stamp, parallel, param = gen_promql_query(SUB_FEATURE, question) - - if not query: - logger.info(f'No query found in the response, query is {query}') - answer = 'Query generation failed, no query found.' - return answer, None - # execute - logger.info('Executing Query: LTP, Metrics') - if not parallel: - resp = execute_promql_query(query, {}) + logger.info('Generating Query: LTP, Metadata') + query = 'restserver/jobs?offset=0&limit=49999&withTotalCount=true&order=completionTime' + + if self.feature_skipped: + logger.info('Skipping job metadata query execution.') + resp = f'Skipping job metadata query execution due to lack of support, this will be enabled in the next release.' 
+ job_metadata = {'result': resp} + job_metadata_brief = {'result': resp} else: - resp = execute_promql_query_step(query, {}, end_time_stamp, f"{param['time_offset']}") - resp_parser_param, value = retrive_promql_response_value(resp, param) - - # generate answer - logger.info('Generating Answer: LTP, Metrics') - summary = gen_summary( - SUB_FEATURE, - {'cluster_job_metrics': value}, - {'cluster_job_metrics': value}, - question, - 'gen_result_summary_metrics.txt', - None, - help_msg, - skip_summary, - ) - - # generate additional info dict - info_dict = {} - info_dict['query_promql'] = query - info_dict['query_param_promql'] = param - info_dict['resp_promql'] = resp - info_dict['value_promql'] = value - info_dict['__debug:f1_query'] = query - info_dict['__debug:f2_execution_method'] = parallel - info_dict['__debug:f3_scrape_interval_in_execution'] = param.get('scrape_interval', None) - info_dict['__debug:f4_resp_parser_param'] = resp_parser_param - info_dict['__debug:r0_type_resp_promql'] = type(resp).__name__ - - return summary, info_dict - - -# session: query job metadata from OpenPAI backend -def query_metadata(question: str, help_msg, skip_summary: bool = False): - """Query job metadata from OpenPAI backend.""" - # generate query - logger.info('Generating Query: LTP, Metadata') - query = 'restserver/jobs?offset=0&limit=49999&withTotalCount=true&order=completionTime' - - if SKIP_LUCIA_CONTROLLER_EXECUTION: - logger.info('Skipping job metadata query execution.') - resp = f'Skipping job metadata query execution due to lack of support, this will be enabled in the next release.' - job_metadata = {'result': resp} - job_metadata_brief = {'result': resp} - else: - # extract - logger.info('Executing Query: LTP, Metadata') - resp = execute_openpai_query(query, {}) - job_metadata = extract_job_metadata(resp) - job_metadata_brief = get_brief_job_metadata(resp) - - # generate answer - logger.info('Generating Answer: LTP, Metadata') - summary = gen_summary( - SUB_FEATURE, - {'job_metadata': job_metadata}, - {'job_metadata_only_the_first_1000_jobs': job_metadata_brief}, - question, - 'gen_result_summary_metadata.txt', - None, - help_msg, - skip_summary, - ) - - # generate additional info dict - info_dict = {} - info_dict['query_promql'] = query - info_dict['resp_brief_promql'] = job_metadata_brief - return summary, info_dict - - -# session: query user manual from LTP documentation -def query_user_manual(question: str, help_msg): - """Query user manual.""" - # read documentation - documentation = get_prompt_from(os.path.join(PROMPT_DIR, SUB_FEATURE, 'ltp_documentation.txt')) - ltp_doc = {'lucia training platform documentation': documentation} - - # generate answer - logger.info('Generating Answer: LTP, User Manual') - summary = gen_summary(SUB_FEATURE, ltp_doc, None, question, 'gen_result_summary_doc.txt', None, help_msg) - - info_dict = {} - return summary, info_dict - - -def extract_job_metadata(resp): - """Extract job metadata from OpenPAI response.""" - if isinstance(resp, dict) and 'data' in resp: - resp_data = resp['data'] - job_metadatas = { - f'{job["username"]}~{job["name"]}': { - k: v for k, v in job.items() if k not in ['debugId', 'subState', 'executionType', 'appExitCode'] - } - for job in resp_data - } - else: - job_metadatas = None - return job_metadatas - - -def get_brief_job_metadata(resp): - """Get brief job metadata.""" - if isinstance(resp, dict) and 'data' in resp: - resp_data = resp['data'] - job_metadatas = [f'{job["username"]}~{job["name"]}' for job in resp_data] - job_metadatas = 
job_metadatas[:1000] - else: - job_metadatas = None - return job_metadatas - - -def query_powerbi(question: str, help_msg): - """Query PowerBI data.""" - - query_gen_res, query_gen_status = query_generation_kql(question) - logger.info(f'KQL Query generation result: {query_gen_res}, status: {query_gen_status}') - k_cluster = os.environ.get('DATA_SRC_KUSTO_CLUSTER_URL', '') - k_db = os.environ.get('DATA_SRC_KUSTO_DATABASE_NAME', '') - k_table = '' - if query_gen_status == 0: - KQL = KustoExecutor(k_cluster, k_db, k_table) - # Replace placeholders - query_gen_res = query_gen_res.format( - cluster_url=k_cluster, - database_name=k_db + # extract + logger.info('Executing Query: LTP, Metadata') + resp = execute_openpai_query(query, {}) + job_metadata = self._extract_job_metadata(resp) + job_metadata_brief = self._get_brief_job_metadata(resp) + + # generate answer + logger.info('Generating Answer: LTP, Metadata') + summary = gen_summary( + self.SUB_FEATURE, + {'job_metadata': job_metadata}, + {'job_metadata_only_the_first_1000_jobs': job_metadata_brief}, + question, + 'gen_result_summary_metadata.txt', + None, + help_msg, + skip_summary, + self.llm_session + ) + + # generate additional info dict + info_dict = {} + info_dict['query_promql'] = query + info_dict['resp_brief_promql'] = job_metadata_brief + return summary, info_dict + + def query_user_manual(self, question: str, help_msg): + """Query user manual.""" + ltp_doc = {'lucia training platform documentation': self.ltp_documentation} + + # generate answer + logger.info('Generating Answer: LTP, User Manual') + summary = gen_summary(self.SUB_FEATURE, ltp_doc, None, question, 'gen_result_summary_doc.txt', None, help_msg, llm_session=self.llm_session) + + info_dict = {} + return summary, info_dict + + def query_powerbi(self, question: str, help_msg): + """Query PowerBI data.""" + + # send HTML snippet so frontend (with rehype-raw enabled) can render with Tailwind styling + push_frontend_event('✍️ Copilot is crafting the query...
', replace=False) + query_gen_res, query_gen_status = query_generation_kql(question) + logger.info(f'KQL Query generation result: {query_gen_res}, status: {query_gen_status}') + k_cluster = os.environ.get('DATA_SRC_KUSTO_CLUSTER_URL', '') + k_db = os.environ.get('DATA_SRC_KUSTO_DATABASE_NAME', '') + k_table = '' + if query_gen_status == 0: + push_frontend_event('📥 Copilot is collecting the information...
', replace=False) + KQL = KustoExecutor(k_cluster, k_db, k_table) + # Replace placeholders + query_gen_res = query_gen_res.format( + cluster_url=k_cluster, + database_name=k_db + ) + response, response_status = KQL.execute_return_data(query_gen_res) + response_long = {"query_generated": query_gen_res, "response_from_query_execution": response} + logger.info(f'Kusto Query execution result: {response}') + else: + response = {} + response_long = {"query_generated": "query generation failed, please perform manual investigation", "response_from_query_execution": response} + response_status = -1 + + logger.info('Generating Answer: LTP, Dashboard') + summary = gen_summary( + self.SUB_FEATURE, + response_long, + response, + question, + 'gen_result_summary_dashboard.txt', + None, + help_msg, + False, + self.llm_session + ) + + if response_status == 0: + reference = f'\n\n >Reference: the generated KQL query used to get the data:\n\n```\n{query_gen_res}\n```' + # push to frontend + push_frontend_event(reference) + + info_dict = {} + info_dict["s0_query_gen"] = {"res": query_gen_res, "status": query_gen_status} + info_dict["s1_query_exe"] = {"res": self._make_json_serializable(response), "status": response_status} + return summary, info_dict + + def auto_reject(self, question: str, help_msg): + """Auto rejected, unsupported by design.""" + + logger.info('Generating Answer: LTP, Auto Rejection') + summary = gen_summary( + self.SUB_FEATURE, + {}, + {}, + question, + 'gen_result_summary_rejection.txt', + None, + help_msg, + False, + self.llm_session + ) + + info_dict = {} + return summary, info_dict + + def human_intervention(self, question: str, help_msg): + """Handle human intervention for LTP auto rejection.""" + + logger.info('Generating Answer: LTP, Human Intervention') + summary = gen_summary( + self.SUB_FEATURE, + {}, + {}, + question, + 'gen_result_summary_human.txt', + None, + help_msg, + False, + self.llm_session ) - response, response_status = KQL.execute_return_data(query_gen_res) - response_long = {"query_generated": query_gen_res, "response_from_query_execution": response} - logger.info(f'Kusto Query execution result: {response}') - else: - response = {} - response_long = {"query_generated": "query generation failed, please perform manual investigation", "response_from_query_execution": response} - response_status = -1 - - logger.info('Generating Answer: LTP, Dashboard') - summary = gen_summary( - SUB_FEATURE, - response_long, - response, - question, - 'gen_result_summary_dashboard.txt', - None, - help_msg, - False, - ) - - if response_status == 0: - reference = f'\n\n >Reference: the generated KQL query used to get the data:\n\n```\n{query_gen_res}\n```' - summary += reference - - info_dict = {} - info_dict["s0_query_gen"] = {"res": query_gen_res, "status": query_gen_status} - info_dict["s1_query_exe"] = {"res": make_json_serializable(response), "status": response_status} - return summary, info_dict - - -def ltp_auto_reject(question: str, help_msg): - """Auto rejected, unsupported by design.""" - - logger.info('Generating Answer: LTP, Auto Rejection') - summary = gen_summary( - SUB_FEATURE, - {}, - {}, - question, - 'gen_result_summary_rejection.txt', - None, - help_msg, - False, - ) - - info_dict = {} - return summary, info_dict - - -def ltp_human_intervention(question: str, help_msg): - """Handle human intervention for LTP auto rejection.""" - - logger.info('Generating Answer: LTP, Human Intervention') - summary = gen_summary( - SUB_FEATURE, - {}, - {}, - question, - 
'gen_result_summary_human.txt', - None, - help_msg, - False, - ) - - info_dict = {} - return summary, info_dict - -def make_json_serializable(data): - """ - Recursively converts non-JSON serializable objects within a data structure. - """ - if isinstance(data, (list, tuple)): - return [make_json_serializable(item) for item in data] - elif isinstance(data, dict): - return {key: make_json_serializable(value) for key, value in data.items()} - elif isinstance(data, datetime.timedelta): - # Convert timedelta to total seconds (a float) - return data.total_seconds() - else: - # Return the object as is if it's already serializable - return data \ No newline at end of file + + info_dict = {} + return summary, info_dict + + @staticmethod + def _extract_job_metadata(resp): + """Extract job metadata from OpenPAI response.""" + if isinstance(resp, dict) and 'data' in resp: + resp_data = resp['data'] + job_metadatas = { + f'{job["username"]}~{job["name"]}': { + k: v for k, v in job.items() if k not in ['debugId', 'subState', 'executionType', 'appExitCode'] + } + for job in resp_data + } + else: + job_metadatas = None + return job_metadatas + + @staticmethod + def _get_brief_job_metadata(resp): + """Get brief job metadata.""" + if isinstance(resp, dict) and 'data' in resp: + resp_data = resp['data'] + job_metadatas = [f'{job["username"]}~{job["name"]}' for job in resp_data] + job_metadatas = job_metadatas[:1000] + else: + job_metadatas = None + return job_metadatas + + @staticmethod + def _make_json_serializable(data): + """ + Recursively converts non-JSON serializable objects within a data structure. + """ + if isinstance(data, (list, tuple)): + return [LTP._make_json_serializable(item) for item in data] + elif isinstance(data, dict): + return {key: LTP._make_json_serializable(value) for key, value in data.items()} + elif isinstance(data, datetime.timedelta): + # Convert timedelta to total seconds (a float) + return data.total_seconds() + else: + # Return the object as is if it's already serializable + return data diff --git a/src/copilot-chat/src/copilot_agent/ltp/ltp_dashboard.py b/src/copilot-chat/src/copilot_agent/ltp/ltp_dashboard.py index ff545518..11d473f6 100644 --- a/src/copilot-chat/src/copilot_agent/ltp/ltp_dashboard.py +++ b/src/copilot-chat/src/copilot_agent/ltp/ltp_dashboard.py @@ -13,9 +13,7 @@ from ..utils.rag import QueryGeneratorRAG from ..utils.logger import logger -from ..config import DATA_DIR - -model = LLMSession() +from ..config import DATA_DIR, AGENT_MINIMAL_ON # Document preparation utilities @@ -114,6 +112,8 @@ def _init_instance(cls): kql_sample_docs_data = DocPrepare.get_txt_as_list_hashtag(KUSTO_SAMPLE_DATA_FILE) kql_knowledge_docs = DocPrepare.get_txt_as_list(KUSTO_KNOWLEDGE_FILE) kql_schema_docs = kql_table_docs + kql_operator_docs + kql_sample_docs + kql_sample_docs_data + kql_knowledge_docs + if AGENT_MINIMAL_ON: + kql_schema_docs = [] logger.info(f'length of kql_schema_docs: {len(kql_schema_docs)}') for doc in kql_schema_docs: logger.info(doc[:50]) diff --git a/src/copilot-chat/src/copilot_agent/prompts/classification/f4/classify.txt b/src/copilot-chat/src/copilot_agent/prompts/classification/f4/classify.txt new file mode 100644 index 00000000..604e4c1d --- /dev/null +++ b/src/copilot-chat/src/copilot_agent/prompts/classification/f4/classify.txt @@ -0,0 +1,99 @@ +Your task is to contextualize the user's question, and understand the user's intention from the question they asked, and assign a object and a concern as an indicator of the type of the question. 
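
As a rough orientation, here is a minimal sketch of how the f3/f4 dispatch shown earlier in this diff might consume the JSON this prompt asks for. It is an illustration only: `json.loads` stands in for the agent's own `extract_json_dict` helper, and the returned strings are placeholders for the real SmartHelp/LTP handlers.

```python
import json

def route(classifier_reply: str) -> str:
    """Illustrative routing on the lv0_object / lv1_concern codes (assumed wiring)."""
    parsed = json.loads(classifier_reply)
    obj = parsed.get('lv0_object', '3')   # default to [general], as in the engine
    con = parsed.get('lv1_concern', '0')  # default to [others]
    if obj.count('8') > 0:
        return f'LTP handler for concern {con}'   # training_platform questions
    if obj.count('3') > 0:
        return 'general answer'                   # public-knowledge questions
    if obj.count('9') > 0:
        return 'feature help'                     # persona / capability questions
    return 'unsupported_question'                 # everything else

print(route('{"new_question": "Is distributed training supported?", "lv0_object": "8", "lv1_concern": "3"}'))
```
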
+ +Please output in the following format: +```json +{ + "new_question": "", + "lv0_object": "", + "lv1_concern": "" +} +``` + +# method for contextualize the user's question + +you need to first classify the relationship between this_question and last_question as either a 'direct follow-up' or a 'new query'. A 'direct follow-up' is a question that uses conversational shortcuts or comparative phrases (e.g., "how about X?", "what about X?", "why?") to build directly upon the previous question's intent. If it's classified as a 'direct follow-up', you should complement the missing language concepts (intent, specific entities, and attributes) from the last question. If it's a 'new query' (meaning it introduces a new topic, metric, or entity, even if it shares a common attribute like a week number), you must NOT contextualize it and should return the question as is. + +# method for identifying the object + +Four question objects are supported: + +8. [training_platform]: it means the user wants to ask for guidance of how to use a model training platform, specifically the Lucia Training Platform, or ask for detailed information about an already launched training job, usually such questions can be answered by searching or querying the detailed logs, debugging informations provided by the training platform. +3. [general]: it means user's can be answered fully using public information or general knowledge, without the need to ask for more context input from user. +9. [persona]: it means the user wants to understand the persona of you, what features you can provide, what capabilities you can serve. +4. [others]: it means user's prompt is not [training_platform] nor [general] nor [persona]. Note that the above types have specific definition. If user question is not belong to [training_platform] nor [general] nor [persona], it should be [others]. + +below knowldge describes some key concepts commonly asked in the training_platform object questions: +knowledge #1: +single node trainig, distributed training, how to config a job, how to submit an issue related to the training platform, etc. + +knowledge #2: +the user manual of LTP (Lucia Training Platform) provides a detailed usage guidance to assist user better utilize the resource to perform single or distributed training task per their request. + +knowledge #3: the whole user manual consists several sections of documentation, with each focusing on explaining the detail instructions, configurations, information. Available sections are: +- [Quick Start](./UserManual/quickstart.md): A guide to help you quickly get started with the platform and submit a "hello world" job. +- [How to Use Docker Image](./UserManual/docker-images.md): Instructions on using Docker images in your jobs. +- [How to Manage Data and Code](./UserManual/use-data.md): Guidance on managing data and code within your jobs. +- [How to Write Job Configuration](./UserManual/job-config.md): Detailed instructions for configuring jobs and distributed jobs. +- [Job Priorities](./UserManual/job-priorities.md): Explanation of job priority types and how to submit jobs with specific priorities. +- [VC Allocation](./UserManual/vc-allocation.md): Guidelines for managing VC allocation requests and assignment change notifications. +- [Notification and Monitoring](./UserManual/notification.md): Information on the platform's notification and monitoring features. +- [Troubleshooting](./UserManual/troubleshooting.md): A troubleshooting guide for common issues and how to get platform support. 
+- [Email Templates, UserGroup Admin](./UserManual/email-templates/email-templates-user.md): Templates for UserGroup Admins to request VC allocations, production priority job submissions and integrating private Azure storage blob. +- [Email Templates, Lucia Training Platform Admin](./UserManual/email-templates/email-templates-ltp.md): Templates for Lucia Training Platform Admins to acknowledge, complete, and notify about VC allocation and assignment changes. + + +knowledge #4: there is a Lucia Training Platform Dashboard which includes data to let platform administrators and platform users to get detailed information about the performance, availability, failures, and resource utilization: +- Cluster Capacity Status +- Availability Status +- Node Health and Failures +- Job Interruption Analysis +- Node Recycle Times +- Mean Time Between Failures (MTBF) +- Daily Node Runtime Failures +- Failure Categorization and Reasons +- Daily Node Failure Occurrences + +# method for identifying the concern: + +Seven concerns are supported: + +1. [cluster_job_metrics]: it means the key concern of the user is to understand the current status overview of an object. Usually it requires perform root cause analysis to answer user's concerns. +2. [job_metadata]: it means the key concern of the user is to understand the root cause of an issue or problem about an object. Usually it requires perform root cause analysis to answer user's concerns. +3. [user_manual]: it means the key concern of the user is to understand a general concept on what is Lucia Training Platform, or LTP, or training platform, usually the question embedded with a intention to understand how to use the platform. +4. [dashboard]: it means the key concern of the user is to get detailed information about the performance, availability, failures, and resource utilization of a model training platform. +5. [auto_rejection]: it means the key concern of the user is not supported by the design of the trainig platform, the examples below provides some unsupported feature for you to compare with. +6. [human_intervention]: it means the key concern of the user is: not straightforward and may involve multiple potential causes, requiring human judgment to diagnose and resolve, or highlights failures or malfunctions in systems, resources, or infrastructure that cannot be automatically diagnose (e.g, ssh connection issue, GPU failure, node failure, job failure, portal crash, etc.), or explicitly or implicitly requests human assistance to investigate, diagnose, or fix the problem. +0. [others]: it means the key concern of the user is too general to abstrct. + +below knowldge describes some key concepts commonly asked in the cluster_job_metrics concern questions: +knowledge #1: metrics backend is Prometheus REST API server +knowledge #2: metrics could be cluster metrics, job metrics, filtered by specific virtual cluster (VC) or user, etc. + +below knowldge describes some key concepts commonly asked in the user_manual concern questions: +knowledge #1: the user manual of LTP (Lucia Training Platform) provides a detailed usage guidance to assist user better utilize the resource to perform single or distributed training task per their request. +knowledge #2: the whole user manual consists several sections of documentation, with each focusing on explaining the detail instructions, configurations, information. Available sections are: +- [Quick Start](./UserManual/quickstart.md): A guide to help you quickly get started with the platform and submit a "hello world" job. 
+- [How to Use Docker Image](./UserManual/docker-images.md): Instructions on using Docker images in your jobs. +- [How to Manage Data and Code](./UserManual/use-data.md): Guidance on managing data and code within your jobs. +- [How to Write Job Configuration](./UserManual/job-config.md): Detailed instructions for configuring jobs and distributed jobs. +- [Job Priorities](./UserManual/job-priorities.md): Explanation of job priority types and how to submit jobs with specific priorities. +- [VC Allocation](./UserManual/vc-allocation.md): Guidelines for managing VC allocation requests and assignment change notifications. +- [Notification and Monitoring](./UserManual/notification.md): Information on the platform's notification and monitoring features. +- [Troubleshooting](./UserManual/troubleshooting.md): A troubleshooting guide for common issues and how to get platform support. +- [Email Templates, UserGroup Admin](./UserManual/email-templates/email-templates-user.md): Templates for UserGroup Admins to request VC allocations, production priority job submissions and integrating private Azure storage blob. +- [Email Templates, Lucia Training Platform Admin](./UserManual/email-templates/email-templates-ltp.md): Templates for Lucia Training Platform Admins to acknowledge, complete, and notify about VC allocation and assignment changes. +knowledge #3: [How to Write Job Configuration] section details below topics: Parameters and Secrets, Distributed Settings, Environment Variables for Distributed Jobs, Job Exit Specifications + +below knowledge describes what dashboard data provides: +knowledge #4: there is a Lucia Training Platform Dashboard which includes data to let platform administrators and platform users to get detailed information about the performance, availability, failures, and resource utilization: +- Cluster Capacity Status +- Availability Status +- Node Health and Failures +- Job Interruption Analysis +- Node Recycle Times +- Mean Time Between Failures (MTBF) +- Daily Node Runtime Failures +- Failure Categorization and Reasons +- Daily Node Failure Occurrences + + diff --git a/src/copilot-chat/src/copilot_agent/prompts/classification/f4/examples.txt b/src/copilot-chat/src/copilot_agent/prompts/classification/f4/examples.txt new file mode 100644 index 00000000..43bf2da5 --- /dev/null +++ b/src/copilot-chat/src/copilot_agent/prompts/classification/f4/examples.txt @@ -0,0 +1,582 @@ +# Following are some examples + +## set a + +[Example a.1] +this_question: how about week 27? +last_question: what is the failure node count for week 28 +thinking: "how about week 27?" would be classified as a direct follow-up. It's a comparative phrase ("how about") and the only new entity is "week 27." The model would then infer the intent and attributes from the last question ("what is the failure node count for...") and correctly build the new question. +output: +```json +{ + "new_question": "what is the failure node count for week 27?", + "lv0_object": "8", + "lv1_concern": "4" +} +``` + +[Example a.2] +this_question: what can you do? +last_question: what is the failure node count for week 28 +output: +```json +{ + "new_question": "what can you do?", + "lv0_object": "9", + "lv1_concern": "0" +} +``` + +[Example a.3] +this_question: What is the average job duration for hardware failures in week 30? +last_question: What is the percentage of "Available VMs (Empty)" in the LON64PrdGPC01 cluster in week 30? +thinking: "What is the average job duration for hardware failures in week 30?" 
would be classified as a new query. While it shares "week 30" with the last question, its core intent (average job duration) and main entity (hardware failures) are completely different from the last question's intent (percentage of available VMs) and entity (LON64PrdGPC01 cluster). The new logic would correctly decide not to apply any contextualization. +output: +```json +{ + "new_question": "What is the average job duration for hardware failures in week 30?", + "lv0_object": "8", + "lv1_concern": "4" +} +``` + +## set b + +[Example b.1] +this_question +Is distributed training supported? +output: +```json +{ + "new_question": "Is distributed training supported?", + "lv0_object": "8", + "lv1_concern": "3" +} +``` + +[Example b.3] +this_question +what is the weather +output: +```json +{ + "new_question": "what is the weather", + "lv0_object": "4", + "lv1_concern": "0" +} +``` + +[Example b.7] +this_question +What is Chrismax +output: +```json +{ + "new_question": "What is Chrismax", + "lv0_object": "3", + "lv1_concern": "0" +} +``` + +[Example b.8] +this_question +Hi! +output: +```json +{ + "new_question": "Hi!", + "lv0_object": "3", + "lv1_concern": "0" +} +``` + +[Example b.9] +this_question +What can you do? +output: +```json +{ + "new_question": "What can you do?", + "lv0_object": "9", + "lv1_concern": "0" +} +``` + +[Example b.10] +this_question +What is the average availability ratio for the cluster CYS13PrdGPC02 in Week 27? +output: +```json +{ + "new_question": "What is the average availability ratio for the cluster CYS13PrdGPC02 in Week 27?", + "lv0_object": "8", + "lv1_concern": "4" +} +``` + +[Example b.11] +this_question +Which cluster had the highest percentage of allocated VMs in Week 22? +output: +```json +{ + "new_question": "Which cluster had the highest percentage of allocated VMs in Week 22?", + "lv0_object": "8", + "lv1_concern": "4" +} +``` + +[Example b.12] +this_question +What was the most common reason for node failures? +output: +```json +{ + "new_question": "What was the most common reason for node failures?", + "lv0_object": "8", + "lv1_concern": "4" +} +``` + +[Example b.13] +this_question +What is the Mean Time Between Failures (MTBF) for the cluster CYS13PrdGPC02 in Week 27? +output: +```json +{ + "new_question": "What is the Mean Time Between Failures (MTBF) for the cluster CYS13PrdGPC02 in Week 27?", + "lv0_object": "8", + "lv1_concern": "4" +} +``` + +[Example b.14] +this_question +What is the average reaction time for hardware failures in Week 27? +output: +```json +{ + "new_question": "What is the average reaction time for hardware failures in Week 27?", + "lv0_object": "8", + "lv1_concern": "4" +} +``` + +[Example b.15] +this_question +How many job interruptions occurred due to hardware failures, and what was the average duration of these interruptions? +output: +```json +{ + "new_question": "How many job interruptions occurred due to hardware failures, and what was the average duration of these interruptions?", + "lv0_object": "8", + "lv1_concern": "4" +} +``` + +[Example b.16] +this_question +Which cluster had the highest percentage of unallocatable VMs in Week 27? +output: +```json +{ + "new_question": "Which cluster had the highest percentage of unallocatable VMs in Week 27?", + "lv0_object": "8", + "lv1_concern": "4" +} +``` + +[Example b.17] +this_question +What was the total number of hardware-related node failures in Week 27 across all endpoints? 
+output: +```json +{ + "new_question": "What was the total number of hardware-related node failures in Week 27 across all endpoints?", + "lv0_object": "8", + "lv1_concern": "4" +} +``` + +[Example b.18] +this_question +What was the node recycle time for cordon nodes in the CYS13PrdGPC02 cluster in Week 25? +output: +```json +{ + "new_question": "What was the node recycle time for cordon nodes in the CYS13PrdGPC02 cluster in Week 25?", + "lv0_object": "8", + "lv1_concern": "4" +} +``` + +[Example b.19] +this_question +What was the average availability ratio for the cluster CYS13PrdGPC02 in Week 27? +output: +```json +{ + "new_question": "What was the average availability ratio for the cluster CYS13PrdGPC02 in Week 27?", + "lv0_object": "8", + "lv1_concern": "4" +} +``` + +[Example b.20] +this_question +How many deallocated nodes were reported for the cluster SJC24PrdGPC03? +output: +```json +{ + "new_question": "How many deallocated nodes were reported for the cluster SJC24PrdGPC03?", + "lv0_object": "8", + "lv1_concern": "4" +} +``` + +[Example b.21] +this_question +introduce yourself +output: +```json +{ + "new_question": "introduce yourself", + "lv0_object": "9", + "lv1_concern": "0" +} +``` + +[Example b.22] +this_question +There is no mechanism to prevent Virtual Cluster (VC) resource abuse, such as requesting an excessive number of GPUs. +output: +```json +{ + "new_question": "There is no mechanism to prevent Virtual Cluster (VC) resource abuse, such as requesting an excessive number of GPUs.", + "lv0_object": "8", + "lv1_concern": "5" +} +``` + +[Example b.23] +this_question +Users may be duplicating efforts by building Docker images that have already been created by others. +output: +```json +{ + "new_question": "Users may be duplicating efforts by building Docker images that have already been created by others.", + "lv0_object": "8", + "lv1_concern": "5" +} +``` + +[Example b.24] +this_question +I have been trying to get 4 nodes and successfully been allocated. However, it gets stuck three nodes are running and one keeps waiting -- blocking the whole process. Can you please take a look at it? +output: +```json +{ + "new_question": "I have been trying to get 4 nodes and successfully been allocated. However, it gets stuck three nodes are running and one keeps waiting -- blocking the whole process. Can you please take a look at it?", + "lv0_object": "8", + "lv1_concern": "6" +} +``` + +[Example b.25] +this_question +seems that we meet some network issue today, some nodes can connect to archive.ubuntu.com but some cannot, could you help to check? Err:12 jammy InRelease Could not connect to archive.ubuntu.com:80 (185.125.190.81), connection timed out Could not connect to archive.ubuntu.com:80 (91.189.91.82), connection timed out Could not connect to archive.ubuntu.com:80 (185.125.190.83), connection timed out Could not connect to archive.ubuntu.com:80 (185.125.190.82), connection timed out Could not connect to archive.ubuntu.com:80 (91.189.91.83), connection timed out +output: +```json +{ + "new_question": "seems that we meet some network issue today, some nodes can connect to archive.ubuntu.com but some cannot, could you help to check? 
Err:12 jammy InRelease Could not connect to archive.ubuntu.com:80 (185.125.190.81), connection timed out Could not connect to archive.ubuntu.com:80 (91.189.91.82), connection timed out Could not connect to archive.ubuntu.com:80 (185.125.190.83), connection timed out Could not connect to archive.ubuntu.com:80 (185.125.190.82), connection timed out Could not connect to archive.ubuntu.com:80 (91.189.91.83), connection timed out", + "lv0_object": "8", + "lv1_concern": "6" +} +``` + +[Example b.26] +this_question +We meet this issue, seems the portal is crashed. +output: +```json +{ + "new_question": "We meet this issue, seems the portal is crashed.", + "lv0_object": "8", + "lv1_concern": "6" +} +``` + +[Example b.27] +this_question +We have a 16 node job failed and we have try to resume several times, each time failed seems because of communication, could you help to find if there are some bad nodes in it? These are job links: +output: +```json +{ + "new_question": "We have a 16 node job failed and we have try to resume several times, each time failed seems because of communication, could you help to find if there are some bad nodes in it? These are job links:", + "lv0_object": "8", + "lv1_concern": "6" +} +``` + +[Example b.28] +this_question +When I try to submit job this afternoon, the portal says Error +output: +```json +{ + "new_question": "When I try to submit job this afternoon, the portal says Error", + "lv0_object": "8", + "lv1_concern": "6" +} +``` + +[Example b.29] +this_question +we have a 16-node job that seems to be hanging during startup — the GPU utilization stays at 0. We've retried multiple times on the same 16 nodes, but the issue persists each time. Could you please help check if there might be any issues with these nodes? here is the job link +output: +```json +{ + "new_question": "we have a 16-node job that seems to be hanging during startup — the GPU utilization stays at 0. We've retried multiple times on the same 16 nodes, but the issue persists each time. Could you please help check if there might be any issues with these nodes? here is the job link", + "lv0_object": "8", + "lv1_concern": "6" +} +``` + +[Example b.30] +this_question +Thanks to the powerful Lucia platform, we have successfully started several 16-node experiments. However, when some experiments are stopped, it seems that the portal does not show that these nodes have been released. This may also cause the jobs submitted later to be waiting all the time. And is it possible to replace the 3 bad nodes in the cluster so that we can start an additional experiment or scale up the current experiment, which will speed up our experiment progress. +output: +```json +{ + "new_question": "Thanks to the powerful Lucia platform, we have successfully started several 16-node experiments. However, when some experiments are stopped, it seems that the portal does not show that these nodes have been released. This may also cause the jobs submitted later to be waiting all the time. And is it possible to replace the 3 bad nodes in the cluster so that we can start an additional experiment or scale up the current experiment, which will speed up our experiment progress.", + "lv0_object": "8", + "lv1_concern": "6" +} +``` + +[Example b.31] +this_question +It seems that there are some potentially faulty nodes on the rstar VC. When we submit multi-node jobs, they may randomly land on these problematic nodes, causing job failures—even though the codebase remains unchanged. 
Here are two job links that may help illustrate the issue: Would it be possible to detect these potentially bad nodes and report them for maintenance? Thanks for your help! +output: +```json +{ + "new_question": "It seems that there are some potentially faulty nodes on the rstar VC. When we submit multi-node jobs, they may randomly land on these problematic nodes, causing job failures—even though the codebase remains unchanged. Here are two job links that may help illustrate the issue: Would it be possible to detect these potentially bad nodes and report them for maintenance? Thanks for your help!", + "lv0_object": "8", + "lv1_concern": "6" +} +``` + +[Example b.32] +this_question +sigma-s-mi300 shows it has been fully used (100%, so no more jobs can be submitted), but just half of the GPUs and CPUs shown been occupied. See the attached screenshot. +output: +```json +{ + "new_question": "sigma-s-mi300 shows it has been fully used (100%, so no more jobs can be submitted), but just half of the GPUs and CPUs shown been occupied. See the attached screenshot.", + "lv0_object": "8", + "lv1_concern": "6" +} +``` + +[Example b.33] +this_question +This job encountered "No CUDA GPUs are available". +output: +```json +{ + "new_question": "This job encountered "No CUDA GPUs are available".", + "lv0_object": "8", + "lv1_concern": "6" +} +``` + +[Example b.34] +this_question +As far as we know, there are three ways for us to communicate with blob storage: Using storage configs for read and write a public blob container. Using blobfuse2 to manually mount our private blob containers for rw. Using azcopy to pre-download necessary files and post-upload results to some specific blob containers. When we are training small models like less than 2B model, the problem may not so obvious and may not cause timeout. But recently, we are trying 7B and 10B models, whose ckpt will be 110G+ (together with the sliced states of each GPU), we may need to continually start with a previously trained ckpt and save the updated ckpts periodically. This demand may require not slow IO speed with blob. We try to save a ckpt after the second step in a training experiment. The current results are: Saving 7B model with method 1 will be timeout: Saving 7B model with method 2 is okay, Method 3 require too much labour for saving ckpts periodically. what is the Best Practice of Using Blob Storage?" +output: +```json +{ + "new_question": "As far as we know, there are three ways for us to communicate with blob storage: Using storage configs for read and write a public blob container. Using blobfuse2 to manually mount our private blob containers for rw. Using azcopy to pre-download necessary files and post-upload results to some specific blob containers. When we are training small models like less than 2B model, the problem may not so obvious and may not cause timeout. But recently, we are trying 7B and 10B models, whose ckpt will be 110G+ (together with the sliced states of each GPU), we may need to continually start with a previously trained ckpt and save the updated ckpts periodically. This demand may require not slow IO speed with blob. We try to save a ckpt after the second step in a training experiment. The current results are: Saving 7B model with method 1 will be timeout: Saving 7B model with method 2 is okay, Method 3 require too much labour for saving ckpts periodically. 
what is the Best Practice of Using Blob Storage?"", + "lv0_object": "8", + "lv1_concern": "6" +} +``` + +## set c + +[Example c.1] +this_question: +what is the idle gpu counts for virtual cluster (vc): mi300s? +output: +```json +{ + "new_question": "what is the idle gpu counts for virtual cluster (vc): mi300s?", + "lv0_object": "8", + "lv1_concern": "1" +} +``` + +[Example c.2] +this_question: +query allocated gpu counts, end time is now, offset is 1 day +output: +```json +{ + "new_question": "query allocated gpu counts, end time is now, offset is 1 day", + "lv0_object": "8", + "lv1_concern": "1" +} + +[Example c.3] +this_question: +query job meta data +output: +```json +{ + "new_question": "query job meta data", + "lv0_object": "8", + "lv1_concern": "2" +} + +[Example c.4] +this_question: +how to submit a distributed training job? +output: +```json +{ + "new_question": "how to submit a distributed training job?", + "lv0_object": "8", + "lv1_concern": "3" +} + +[Example c.5] +this_question: +write/create a new benchmark for ... +output: +```json +{ + "new_question": "write/create a new benchmark for ...", + "lv0_object": "4", + "lv1_concern": "0" +} +``` + +[Example c.6] +this_question: +query a list of running or history jobs +output: +```json +{ + "new_question": "query a list of running or history jobs", + "lv0_object": "8", + "lv1_concern": "1" +} +``` + +[Example c.31] +this_question: +I'm new to the platform, it would be nice if I could access the job via ssh for debugging, currently I can only modify the yaml file and rerun the job for debugging, which is a bit cumbersome. +output: +```json +{ + "new_question": "I'm new to platform, it would be nice if I could access the job via ssh for debugging, currently I can only modify the yaml file and rerun the job for debugging, which is a bit cumbersome.", + "lv0_object": "8", + "lv1_concern": "3" +} +``` + +[Example c.32] +this_question: +Is there any jump machine that can be used to ssh to the MI300x machine? +output: +```json +{ + "new_question": "Is there any jump machine that can be used to ssh to the MI300x machine?", + "lv0_object": "8", + "lv1_concern": "3" +} +``` + +[Example c.33] +this_question: +I conducted experiments using the same environment and dataset on both a single node and multiple nodes. From the experiments, I observed the following:

\n
    \n
  1. The total training steps for a single node are 8,638, with an estimated training time of around 52 hours.
  2. The total training steps for 4 nodes are 2,159, with an estimated training time of around 77 hours.
\n

I am wondering whether there might be some communication overhead causing the training time on multiple nodes to be higher than on a single node. Thank you! +output: +```json +{ + "new_question": "I conducted experiments using the same environment and dataset on both a single node and multiple nodes. From the experiments, I observed the following:

\n
    \n
  1. The total training steps for a single node are 8,638, with an estimated training time of around 52 hours.
  2. The total training steps for 4 nodes are 2,159, with an estimated training time of around 77 hours.
\n

I am wondering whether there might be some communication overhead causing the training time on multiple nodes to be higher than on a single node. Thank you!", + "lv0_object": "8", + "lv1_concern": "1" +} +``` + +[Example c.34] +this_question: +Users may not be familiar with certain types of GPUs (especially AMD GPUs). Launching jobs on these GPUs often results in a higher failure rate due to lack of familiarity. +output: +```json +{ + "new_question": "Users may not be familiar with certain types of GPUs (especially AMD GPUs). Launching jobs on these GPUs often results in a higher failure rate due to lack of familiarity.", + "lv0_object": "8", + "lv1_concern": "3" +} +``` + +[Example c.35] +this_question: +The behavior of different job priorities is not transparent to users, making it difficult for them to make informed decisions. +output: +```json +{ + "new_question": "The behavior of different job priorities is not transparent to users, making it difficult for them to make informed decisions.", + "lv0_object": "8", + "lv1_concern": "3" +} +``` + +[Example c.36] +this_question: +Can I setup a SSH connection to the node? +output: +```json +{ + "new_question": "Can I setup a SSH connection to the node?", + "lv0_object": "8", + "lv1_concern": "3" +} +``` + +[Example c.37] +this_question: +Users are unable to manage their expectations regarding job completion times due to a lack of visibility into job progress or estimated end times. +output: +```json +{ + "new_question": "Users are unable to manage their expectations regarding job completion times due to a lack of visibility into job progress or estimated end times.", + "lv0_object": "8", + "lv1_concern": "5" +} +``` + +[Example c.38] +this_question: +Which job had the longest duration in week 30, and what was its exit reason? +output: +```json +{ + "new_question": "Which job had the longest duration in week 30, and what was its exit reason?", + "lv0_object": "8", + "lv1_concern": "4" +} +``` + +[Example c.39] +this_question: +What are the cluster names in pai-wcu endpoint? 
+output: +```json +{ + "new_question": "What are the cluster names in pai-wcu endpoint?", + "lv0_object": "8", + "lv1_concern": "4" +} +``` \ No newline at end of file diff --git a/src/copilot-chat/src/copilot_agent/prompts/help/infrawise_help_f3.json b/src/copilot-chat/src/copilot_agent/prompts/help/infrawise_help.json similarity index 100% rename from src/copilot-chat/src/copilot_agent/prompts/help/infrawise_help_f3.json rename to src/copilot-chat/src/copilot_agent/prompts/help/infrawise_help.json diff --git a/src/copilot-chat/src/copilot_agent/utils/__init__.py b/src/copilot-chat/src/copilot_agent/utils/__init__.py index 9f2b2ea6..d0161dad 100644 --- a/src/copilot-chat/src/copilot_agent/utils/__init__.py +++ b/src/copilot-chat/src/copilot_agent/utils/__init__.py @@ -5,7 +5,7 @@ from .authentication import AuthenticationManager from .classify import QuestionClassifier -from .conversation_manager import contextualize_question +from .conversation_manager import Contextualizer from .dcw import dcw_parser, extract_dcw_from_history, gen_dcw, parse_and_align_dcw from .kql_executor import KustoExecutor from .logger import ( @@ -31,6 +31,10 @@ from .promql import ( execute_promql_query, ) +from .push_frontend import( + push_frontend_event, + set_thread_llm_session, +) from .query import ( gen_kusto_query_fallback_pseudo, gen_kusto_query_pseudo, @@ -43,7 +47,7 @@ RestAPIClient, ) from .smart_help import ( - gen_smart_help, + SmartHelp, ) from .sql import ( SQLManager, @@ -63,6 +67,7 @@ __all__ = [ 'DCW', 'AuthenticationManager', + 'Contextualizer', 'Customer', 'Design', 'KustoExecutor', @@ -71,9 +76,9 @@ 'PowerBIClient', 'QuestionClassifier', 'QueryGeneratorRAG', + 'SmartHelp', 'SQLManager', 'RestAPIClient', - 'contextualize_question', 'dcw_parser', 'execute_openpai_query', 'execute_promql_query', @@ -85,7 +90,6 @@ 'gen_kusto_query_fallback_pseudo', 'gen_kusto_query_pseudo', 'gen_size_range', - 'gen_smart_help', 'gen_sql_query', 'gen_summary', 'get_current_unix_timestamp', @@ -94,6 +98,8 @@ 'is_valid_json', 'logger', 'parse_and_align_dcw', + 'push_frontend_event', 'retry_function', 'save_to_csv', + 'set_thread_llm_session', ] diff --git a/src/copilot-chat/src/copilot_agent/utils/authentication.py b/src/copilot-chat/src/copilot_agent/utils/authentication.py index ba1f4f8b..3d532a99 100644 --- a/src/copilot-chat/src/copilot_agent/utils/authentication.py +++ b/src/copilot-chat/src/copilot_agent/utils/authentication.py @@ -6,6 +6,7 @@ import os import requests import urllib.parse +import threading from datetime import datetime, timezone @@ -20,6 +21,8 @@ def __init__(self, expiration_ms: int = 3600000): self.restserver_url = os.getenv('RESTSERVER_URL', '') valid_vcs_env = os.getenv('COPILOT_VALID_VCS', 'admin,superuser') self.valid_vcs = [g.strip() for g in valid_vcs_env.split(',') if g.strip()] + # Add thread safety for authentication state + self._auth_lock = threading.Lock() def sanitize_username(self, username: str) -> str: """Sanitize the username by URL-encoding it to prevent path traversal or injection attacks.""" @@ -71,42 +74,45 @@ def set_authenticate_state(self, username: str, token: str) -> None: """Set the authentication state for a user, storing admin and virtualCluster info.""" expires_at = int(datetime.now(timezone.utc).timestamp() * 1000) + self.expiration_ms is_admin, virtual_cluster = self.authenticate(username, token) - if is_admin is not None and virtual_cluster is not None: - self.authenticate_state[username] = { - 'token': token, - 'expires_at': expires_at, - 'is_admin': 
is_admin, - 'virtual_cluster': virtual_cluster - } - else: - self.revoke(username) + with self._auth_lock: + if is_admin is not None and virtual_cluster is not None: + self.authenticate_state[username] = { + 'token': token, + 'expires_at': expires_at, + 'is_admin': is_admin, + 'virtual_cluster': virtual_cluster + } + else: + self.authenticate_state.pop(username, None) # Thread-safe removal def is_authenticated(self, username: str) -> bool: - state = self.authenticate_state.get(username) - now = int(datetime.now(timezone.utc).timestamp() * 1000) - if not state: - return False - if state['expires_at'] < now: - self.revoke(username) - return False - if "is_admin" not in state: - return False - if "virtual_cluster" not in state: - return False - if "is_admin" in state and "virtual_cluster" in state: - if state["is_admin"]: - # validate pass condition one: user is an admin - return True - elif not state["is_admin"] and self.get_membership(state["virtual_cluster"]): - # validate pass condition two: user is not an admin, but it belongs to a valid virtualCluster - return True - else: + with self._auth_lock: + state = self.authenticate_state.get(username) + now = int(datetime.now(timezone.utc).timestamp() * 1000) + if not state: + return False + if state['expires_at'] < now: + # Expired, remove from state + self.authenticate_state.pop(username, None) + return False + if "is_admin" not in state: return False - return False + if "virtual_cluster" not in state: + return False + if "is_admin" in state and "virtual_cluster" in state: + if state["is_admin"]: + # validate pass condition one: user is an admin + return True + elif not state["is_admin"] and self.get_membership(state["virtual_cluster"]): + # validate pass condition two: user is not an admin, but it belongs to a valid virtualCluster + return True + else: + return False + return False def get_membership(self, groups: list) -> bool: return any(group in self.valid_vcs for group in groups) def revoke(self, username: str): - if username in self.authenticate_state: - del self.authenticate_state[username] + with self._auth_lock: + self.authenticate_state.pop(username, None) diff --git a/src/copilot-chat/src/copilot_agent/utils/classify.py b/src/copilot-chat/src/copilot_agent/utils/classify.py index d52c7a1b..2626645d 100644 --- a/src/copilot-chat/src/copilot_agent/utils/classify.py +++ b/src/copilot-chat/src/copilot_agent/utils/classify.py @@ -8,12 +8,21 @@ from ..config import PROMPT_DIR from ..utils.logger import logger -from ..utils.utils import get_prompt_from +from ..utils.utils import get_prompt_from, extract_json_dict class QuestionClassifier: def __init__(self, version, model): self.version = version self.model = model + if self.version == 'f3': + self.lv0_system_prompt = get_prompt_from(os.path.join(PROMPT_DIR, 'classification/f3/lv0.txt')) + self.lv1_system_prompt = get_prompt_from(os.path.join(PROMPT_DIR, 'classification/f3/lv1.txt')) + elif self.version == 'f4': + self.lv0_system_prompt = get_prompt_from(os.path.join(PROMPT_DIR, 'classification/f4/classify.txt')) + self.lv1_system_prompt = get_prompt_from(os.path.join(PROMPT_DIR, 'classification/f4/examples.txt')) + else: + self.lv0_system_prompt = None + self.lv1_system_prompt = None def classify_question(self, question: str) -> dict: """Classify the question and return a dictionary with the results.""" @@ -31,17 +40,29 @@ def classify_question(self, question: str) -> dict: def classifier_lv0(self, question: str) -> str: """Classify the user question into several categories.""" if 
self.version == 'f3': - sys_prompt = get_prompt_from(os.path.join(PROMPT_DIR, 'classification/f3/lv0.txt')) + resp = self.model.chat(self.lv0_system_prompt, question) else: - raise ValueError(f'Unsupported version: {self.version}') - resp = self.model.chat(sys_prompt, question) + resp = '3' # default to 3 return resp def classifier_lv1(self, question: str) -> str: """Classify the user question into several categories.""" if self.version == 'f3': - sys_prompt = get_prompt_from(os.path.join(PROMPT_DIR, 'classification/f3/lv1.txt')) - resp = self.model.chat(sys_prompt, question) + resp = self.model.chat(self.lv1_system_prompt, question) else: resp = '0' # default to 0 return resp + + def parse_question(self, this_inquiry: str, last_inquiry: str) -> dict: + """Classify the question and return a dictionary with the results.""" + question_struct = {'new_question': '', 'lv0_object': None, 'lv1_concern': None} + resp = self.model.chat(self.lv0_system_prompt + self.lv1_system_prompt, f'this_inquiry: {this_inquiry}, last_inquiry: {last_inquiry}') + resp_dict = extract_json_dict(resp, nested=False) + if resp_dict: + if isinstance(resp_dict, dict): + question_struct['new_question'] = resp_dict.get('new_question', '') + question_struct['lv0_object'] = resp_dict.get('lv0_object', None) + question_struct['lv1_concern'] = resp_dict.get('lv1_concern', None) + logger.info(f'Parsed question structure: {question_struct}') + return question_struct + diff --git a/src/copilot-chat/src/copilot_agent/utils/conversation_manager.py b/src/copilot-chat/src/copilot_agent/utils/conversation_manager.py index fbd36cc1..e48289b5 100644 --- a/src/copilot-chat/src/copilot_agent/utils/conversation_manager.py +++ b/src/copilot-chat/src/copilot_agent/utils/conversation_manager.py @@ -15,22 +15,25 @@ from ..utils.llmsession import LLMSession from ..utils.utils import get_prompt_from, extract_json_dict -model = LLMSession() - -def contextualize_question(question: str, last_question: str | None) -> str: +class Contextualizer: """Contextualizes the current question based on the last question.""" - logger.info(f"Contextualizing question: '{question}' based on last question: '{last_question}'") - if last_question is None: - return question - else: - contextualize_prompt = get_prompt_from(os.path.join(PROMPT_DIR, 'dialouge_state_tracking', 'dst.txt')) - user_prompt = str({ - 'this_question': question, - 'last_question': last_question, - }) - new_question_str = model.chat(contextualize_prompt, user_prompt) - new_question_dict = extract_json_dict(new_question_str, nested=False) - if isinstance(new_question_dict, dict): - new_question = new_question_dict.get('new_question', question) - logger.info(f"Return: '{new_question}'") - return new_question \ No newline at end of file + def __init__(self, llm_session: LLMSession): + self.llm_session = llm_session + self.contextualize_prompt = get_prompt_from(os.path.join(PROMPT_DIR, 'dialouge_state_tracking', 'dst.txt')) + + def contextualize(self, question: str, last_question: str | None) -> str: + """Contextualizes the current question based on the last question.""" + logger.info(f"Contextualizing question: '{question}' based on last question: '{last_question}'") + if last_question is None: + return question + else: + user_prompt = str({ + 'this_question': question, + 'last_question': last_question, + }) + new_question_str = self.llm_session.chat(self.contextualize_prompt, user_prompt) + new_question_dict = extract_json_dict(new_question_str, nested=False) + if isinstance(new_question_dict, 
dict): + new_question = new_question_dict.get('new_question', question) + logger.info(f"Return: '{new_question}'") + return new_question diff --git a/src/copilot-chat/src/copilot_agent/utils/dcw.py b/src/copilot-chat/src/copilot_agent/utils/dcw.py index 1f6f685f..111727b5 100644 --- a/src/copilot-chat/src/copilot_agent/utils/dcw.py +++ b/src/copilot-chat/src/copilot_agent/utils/dcw.py @@ -16,14 +16,11 @@ from ..utils.types import DCW, Customer, Design from ..utils.utils import get_prompt_from -model = LLMSession() - - # generate dcw (design, criteria, workload) from using question -def gen_dcw(user_prompt: str, map_existing: bool) -> DCW: +def gen_dcw(user_prompt: str, map_existing: bool, llm_session: LLMSession) -> DCW: """Generate a DCW object from the user's question.""" sys_prompt = get_prompt_from(os.path.join(PROMPT_DIR, 'gen_dcw_prompt.txt')) - resp = model.chat(sys_prompt, user_prompt) + resp = llm_session.chat(sys_prompt, user_prompt) logger.info(f'gen_dcw, user_prompt, is {user_prompt}') logger.info(f'gen_dcw, resp, is {resp}') if 'target' not in resp.lower() and 'baseline' not in resp.lower(): @@ -41,7 +38,7 @@ def gen_dcw(user_prompt: str, map_existing: bool) -> DCW: if all(item not in dcw.Workload.lower() for item in revise_json['list']): logger.info(f'before revise: {dcw}') - dcw.Workload = model.chat(revise_sys_prompt, revise_user_prompt) + dcw.Workload = llm_session.chat(revise_sys_prompt, revise_user_prompt) logger.info(f'after revise: {dcw}') return dcw diff --git a/src/copilot-chat/src/copilot_agent/utils/llmsession.py b/src/copilot-chat/src/copilot_agent/utils/llmsession.py index 69269937..307494c4 100644 --- a/src/copilot-chat/src/copilot_agent/utils/llmsession.py +++ b/src/copilot-chat/src/copilot_agent/utils/llmsession.py @@ -6,29 +6,48 @@ import os import time import openai +import requests +import threading +from requests.adapters import HTTPAdapter +from urllib3.util.retry import Retry from ..utils.logger import logger class LLMSession: - """A class to interact with the Azure OpenAI model.""" + """A thread-safe class for interacting with the Azure OpenAI model.""" + _global_stream_callback = None # Class-level attribute for backward compatibility + _config_cache = None # Cache configuration to avoid repeated env var reads + _config_lock = threading.Lock() # Lock for config cache def __init__(self): - # Env Var to set the LLM provider, accepted values are 'openai' or 'azure' - self.provider = os.environ.get("COPILOT_LLM_PROVIDER") - logger.info(f'COPILOT LLM Endpoint Provider: {self.provider}') - self.azure_api_key = os.environ.get("AZURE_OPENAI_API_KEY") - self.openai_api_key = os.environ.get("OPENAI_API_KEY") - self.endpoint = os.environ.get("COPILOT_LLM_ENDPOINT") - self.embedding_url = os.environ.get("COPILOT_EMBEDDING_URL") - self.model_name = os.environ.get("COPILOT_LLM_MODEL") - self.model_version = os.environ.get("COPILOT_LLM_VERSION") - self.embedding_model_name = os.environ.get("COPILOT_EMBEDDING_MODEL") + """Initialize a new LLMSession instance per request.""" + # Use cached config to avoid repeated environment variable reads + config = self._get_cached_config() + + self.provider = config['provider'] + self.azure_api_key = config['azure_api_key'] + self.openai_api_key = config['openai_api_key'] + self.endpoint = config['endpoint'] + self.model_name = config['model_name'] + self.model_version = config['model_version'] + self.embedding_model_name = config['embedding_model_name'] + + # Create a separate session for each instance to avoid blocking + 
self.session = requests.Session() + + # Per-instance stream callback to avoid cross-user contamination + self._instance_stream_callback = None + retries = Retry(total=5, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504]) + adapter = HTTPAdapter(max_retries=retries) + self.session.mount('https://', adapter) + self.session.headers.update({"Authorization": f"Bearer {self.openai_api_key or self.azure_api_key}"}) + if self.provider == "openai": self.model = openai.OpenAI( base_url=self.endpoint, api_key=self.openai_api_key ) self.embedding_model = openai.OpenAI( - base_url=self.embedding_url, + base_url=self.endpoint, api_key=self.openai_api_key ) elif self.provider == "azure": @@ -38,7 +57,7 @@ def __init__(self): api_version=self.model_version ) self.embedding_model = openai.AzureOpenAI( - azure_endpoint=self.embedding_url, + azure_endpoint=self.endpoint, api_key=self.azure_api_key, api_version=self.model_version ) @@ -46,6 +65,28 @@ def __init__(self): logger.error(f'Unsupported LLM provider: {self.provider}') raise ValueError(f'Unsupported LLM provider: {self.provider}') + @classmethod + def _get_cached_config(cls): + """Get cached configuration or read from environment variables.""" + if cls._config_cache is None: + with cls._config_lock: + if cls._config_cache is None: # Double-check pattern + cls._config_cache = { + 'provider': os.environ.get("COPILOT_LLM_PROVIDER"), + 'azure_api_key': os.environ.get("AZURE_OPENAI_API_KEY"), + 'openai_api_key': os.environ.get("OPENAI_API_KEY"), + 'endpoint': os.environ.get("COPILOT_LLM_ENDPOINT"), + 'model_name': os.environ.get("COPILOT_LLM_MODEL"), + 'model_version': os.environ.get("COPILOT_LLM_VERSION"), + 'embedding_model_name': os.environ.get("COPILOT_EMBEDDING_MODEL") + } + logger.info(f'COPILOT LLM Endpoint Provider: {cls._config_cache["provider"]}') + return cls._config_cache + + def close_session(self): + """Close the persistent HTTP session.""" + self.session.close() + def chat(self, system_prompt, user_prompt): """Chat with the language model.""" msg = [ @@ -112,4 +153,139 @@ def get_embedding(self, text): raise ValueError(f"Unsupported LLM provider: {self.provider}") except Exception as e: logger.error(f"Error getting embedding: {e}") - raise \ No newline at end of file + raise + + def stream_chat(self, system_prompt, user_prompt): + """Stream chat responses from the language model as a generator yielding text chunks. + + This method works with both the OpenAI and Azure OpenAI clients used in this project. + It yields incremental text chunks as they arrive from the SDK's streaming API. Callers + can iterate over the generator to provide a streaming UX. If streaming fails after + retries, a single fallback message chunk will be yielded. + """ + msg = [ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": user_prompt} + ] + + max_retries = 5 + backoff = 2 # Initial backoff in seconds + + for attempt in range(max_retries): + try: + # Start streaming from the provider. The SDK returns an iterator of events. + if self.provider == "azure": + stream = self.model.chat.completions.create( + model=self.model_name, + messages=msg, + max_completion_tokens=10000, + stream=True + ) + elif self.provider == "openai": + stream = self.model.chat.completions.create( + model=self.model_name, + messages=msg, + max_tokens=10000, + stream=True + ) + else: + logger.error(f"Unsupported LLM provider in stream_chat: {self.provider}") + break + + # Iterate over streaming events and yield text increments. 
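# --- Editorial aside (not part of the patch) --------------------------------
# The hunk above gives each LLMSession its own requests.Session and mounts an
# HTTPAdapter with a urllib3 Retry policy, so transient 429/5xx responses are
# retried with backoff instead of failing the call. Below is a minimal,
# self-contained sketch of that pattern; build_retry_session and its token
# parameter are illustrative names, not identifiers from this patch.
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def build_retry_session(token: str) -> requests.Session:
    """Return a Session that retries 429/5xx responses with exponential backoff."""
    session = requests.Session()
    retries = Retry(total=5, backoff_factor=1,
                    status_forcelist=[429, 500, 502, 503, 504])
    session.mount('https://', HTTPAdapter(max_retries=retries))
    session.headers.update({'Authorization': f'Bearer {token}'})
    return session
# One session per LLMSession instance (rather than a module-level singleton)
# keeps connection pools and auth headers isolated between concurrent requests.
# -----------------------------------------------------------------------------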
+ # The exact shape of events can vary between SDK versions, so try a few access patterns. + full = '' + for event in stream: + chunk = None + try: + # event may be a mapping-like object or an SDK object with attributes + # Try dict-style access first + if isinstance(event, dict): + choices = event.get('choices') + else: + choices = getattr(event, 'choices', None) + + if choices: + # Support both mapping and attribute access for choice and delta + choice = choices[0] + + # choice might be a dict or an object + if isinstance(choice, dict): + delta = choice.get('delta', {}) + chunk = delta.get('content') + else: + # object-like access + delta = getattr(choice, 'delta', None) + if delta is not None: + # delta could be a mapping or an object + if isinstance(delta, dict): + chunk = delta.get('content') + else: + chunk = getattr(delta, 'content', None) + except Exception: + # Be resilient to unexpected event shapes + chunk = None + + if chunk: + # accumulate to reconstruct full text and yield the full snapshot + full += chunk + # call instance callback first (higher priority), then global callback + try: + cb = self._instance_stream_callback or LLMSession._global_stream_callback + if cb: + cb(full) + except Exception: + logger.debug('Stream callback failed') + yield chunk + + # If stream finishes without exception, stop generator normally + return + + except Exception as e: + if "429" in str(e): + logger.warning(f"429 Too Many Requests: Retrying in {backoff} seconds (Attempt {attempt + 1}/{max_retries})") + time.sleep(backoff) + backoff *= 2 # Exponential backoff + else: + logger.error(f"Unexpected error in stream_chat: {e}") + break + + # If retries are exhausted, yield a meaningful fallback chunk so callers can display something + logger.error("Exceeded maximum retries for chat stream request.") + yield "The system is currently overloaded. Please try again later." + + @classmethod + def set_global_stream_callback(cls, cb): + cls._global_stream_callback = cb + + @classmethod + def clear_global_stream_callback(cls): + cls._global_stream_callback = None + + def set_instance_stream_callback(self, cb): + """Set per-instance stream callback to avoid cross-user contamination.""" + self._instance_stream_callback = cb + + def clear_instance_stream_callback(self): + """Clear per-instance stream callback.""" + self._instance_stream_callback = None + + def try_stream_fallback_chat(self, system_prompt: str, user_prompt: str) -> str: + """Try streaming the response (if a global stream callback is set) and fall back to the blocking chat call. + + Returns the final aggregated text. + """ + try: + if self._instance_stream_callback or getattr(LLMSession, '_global_stream_callback', None): + logger.info('LLMSession: streaming via try_stream_fallback_chat') + last = '' + for snapshot in self.stream_chat(system_prompt, user_prompt): + if snapshot: + last = snapshot + return last + else: + logger.info('LLMSession: non-streaming via try_stream_fallback_chat') + return self.chat(system_prompt, user_prompt) + except Exception as e: + logger.error(f"try_stream_fallback_chat failed: {e}. 
Falling back to non-streaming chat.") + return self.chat(system_prompt, user_prompt) \ No newline at end of file diff --git a/src/copilot-chat/src/copilot_agent/utils/logger.py b/src/copilot-chat/src/copilot_agent/utils/logger.py index 48ca6f6a..eda1841e 100644 --- a/src/copilot-chat/src/copilot_agent/utils/logger.py +++ b/src/copilot-chat/src/copilot_agent/utils/logger.py @@ -53,12 +53,24 @@ def setup_logging(): # Expose the root logger so all modules use the same logger instance class SimpleLogger: def info(self, msg): - print(f"{Fore.GREEN}[INFO]{Style.RESET_ALL} {msg}") + try: + print(f"{Fore.GREEN}[INFO]{Style.RESET_ALL} {msg}", flush=True) + except (BrokenPipeError, OSError): + logging.getLogger().info(msg) def error(self, msg): - print(f"{Fore.RED}[ERROR]{Style.RESET_ALL} {msg}") + try: + print(f"{Fore.RED}[ERROR]{Style.RESET_ALL} {msg}", flush=True) + except (BrokenPipeError, OSError): + logging.getLogger().error(msg) def debug(self, msg): - print(f"{Fore.CYAN}[DEBUG]{Style.RESET_ALL} {msg}") + try: + print(f"{Fore.CYAN}[DEBUG]{Style.RESET_ALL} {msg}", flush=True) + except (BrokenPipeError, OSError): + logging.getLogger().debug(msg) def warning(self, msg): - print(f"{Fore.YELLOW}[WARNING]{Style.RESET_ALL} {msg}") + try: + print(f"{Fore.YELLOW}[WARNING]{Style.RESET_ALL} {msg}", flush=True) + except (BrokenPipeError, OSError): + logging.getLogger().warning(msg) logger = SimpleLogger() diff --git a/src/copilot-chat/src/copilot_agent/utils/push_frontend.py b/src/copilot-chat/src/copilot_agent/utils/push_frontend.py new file mode 100644 index 00000000..c2788a58 --- /dev/null +++ b/src/copilot-chat/src/copilot_agent/utils/push_frontend.py @@ -0,0 +1,54 @@ +import json +import threading + +from ..utils.logger import logger +from ..utils.llmsession import LLMSession + +# Thread-local storage to track the current LLM session for this request +_thread_local = threading.local() + +def set_thread_llm_session(llm_session): + """Set the LLM session for the current thread (for streaming context).""" + _thread_local.llm_session = llm_session + +def get_thread_llm_session(): + """Get the LLM session for the current thread.""" + return getattr(_thread_local, 'llm_session', None) + +def push_frontend_event(content: str, replace: bool = False): + """Push an event to the frontend.""" + # Try to use thread-local LLM session first (for per-user streaming) + # Fall back to global callback for backward compatibility + try: + cb = None + thread_session = get_thread_llm_session() + if thread_session and hasattr(thread_session, '_instance_stream_callback'): + cb = thread_session._instance_stream_callback + + if not cb: + cb = LLMSession._global_stream_callback + + if cb: + if replace: + cb(content) + else: + cb(json.dumps({"type": "append", "text": content})) + except Exception as e: + logger.debug(f"Failed to stream appended content: {e}") + + +def push_frontend_meta(message_info: dict): + """Push a metadata event (messageInfo) to the frontend so client can attach turnId before answer arrives.""" + try: + cb = None + thread_session = get_thread_llm_session() + if thread_session and hasattr(thread_session, '_instance_stream_callback'): + cb = thread_session._instance_stream_callback + + if not cb: + cb = LLMSession._global_stream_callback + + if cb: + cb(json.dumps({"type": "meta", "messageInfo": message_info})) + except Exception as e: + logger.debug(f"Failed to stream meta event: {e}") \ No newline at end of file diff --git a/src/copilot-chat/src/copilot_agent/utils/query.py 
b/src/copilot-chat/src/copilot_agent/utils/query.py index 44cc25e2..db1ba6fa 100644 --- a/src/copilot-chat/src/copilot_agent/utils/query.py +++ b/src/copilot-chat/src/copilot_agent/utils/query.py @@ -17,35 +17,33 @@ from ..utils.time import get_current_unix_timestamp from ..utils.utils import extract_json_dict, get_prompt_from -model = LLMSession() - -def gen_kusto_query_pseudo(SUB_FEATURE: str, gen_prompt_file: str, user_prompt: str) -> dict: +def gen_kusto_query_pseudo(SUB_FEATURE: str, gen_prompt_file: str, user_prompt: str, llm_session: LLMSession) -> dict: """Generate controller input.""" sys_prompt = get_prompt_from(os.path.join(PROMPT_DIR, SUB_FEATURE, gen_prompt_file)) - resp = model.chat(sys_prompt, user_prompt) + resp = llm_session.chat(sys_prompt, user_prompt) controller_input = extract_json_dict(benchmark=resp, nested=False) return controller_input -def gen_kusto_query_fallback_pseudo(question, SUB_FEATURE, gen_fallback_prompt) -> str: +def gen_kusto_query_fallback_pseudo(question, SUB_FEATURE, gen_fallback_prompt, llm_session: LLMSession) -> str: """Generate fallback query from RCA to status.""" logger.info('Generate a fall back status query') system_prompt = get_prompt_from(os.path.join(PROMPT_DIR, SUB_FEATURE, gen_fallback_prompt)) user_prompt = f' is\n{question}' - resp = model.chat(system_prompt, user_prompt) + resp = llm_session.chat(system_prompt, user_prompt) fall_back_question = resp.replace('```', '') return fall_back_question -def gen_sql_query(SUB_FEATURE: str, database: SQLManager, question: str) -> str: +def gen_sql_query(SUB_FEATURE: str, database: SQLManager, question: str, llm_session: LLMSession) -> str: """Generate a general SQL query.""" logger.info('Generate a SQL query') generation_prompt = get_prompt_from(os.path.join(PROMPT_DIR, SUB_FEATURE, 'gen_query_sql_general.txt')) accepeted_value_prompt = database.get_unique_values() system_prompt = generation_prompt + f' are\n{accepeted_value_prompt}' user_prompt = f' is\n{question}' - resp = model.chat(system_prompt, user_prompt) + resp = llm_session.chat(system_prompt, user_prompt) logger.info(f'resp {resp}') matches = re.findall(r'[`\']{3}(.*?)[`\']{3}', resp, re.DOTALL) if not matches: @@ -136,7 +134,7 @@ def _get_promql_param(): return param -def _gen_promql_query_param(SUB_FEATURE: str, question: str) -> dict: +def _gen_promql_query_param(SUB_FEATURE: str, question: str, llm_session: LLMSession) -> dict: """Generate a general PromQL query.""" logger.info('Generate a PromQL query') generation_prompt = get_prompt_from(os.path.join(PROMPT_DIR, SUB_FEATURE, 'gen_query_promql_metrics.txt')) @@ -144,16 +142,16 @@ def _gen_promql_query_param(SUB_FEATURE: str, question: str) -> dict: # logger.info(f'accepeted_value_prompt:\n{accepeted_value_prompt}') system_prompt = generation_prompt + f' are\n{accepeted_value_prompt}' user_prompt = f' is\n{question}' - resp = model.chat(system_prompt, user_prompt) + resp = llm_session.chat(system_prompt, user_prompt) logger.info(f'resp:\n{resp}') params = extract_json_dict(benchmark=resp, nested=False) logger.info(f'params:\n{params}') return params -def gen_promql_query(SUB_FEATURE: str, question: str) -> tuple[str, bool]: +def gen_promql_query(SUB_FEATURE: str, question: str, llm_session: LLMSession) -> tuple[str, bool]: """Generate a general PromQL query.""" - params = _gen_promql_query_param(SUB_FEATURE, question) + params = _gen_promql_query_param(SUB_FEATURE, question, llm_session) if not isinstance(params, dict): logger.info(f'No query found in the response, params is 
{params}') return None, None, None, None diff --git a/src/copilot-chat/src/copilot_agent/utils/smart_help.py b/src/copilot-chat/src/copilot_agent/utils/smart_help.py index 9023b9fe..157496c2 100644 --- a/src/copilot-chat/src/copilot_agent/utils/smart_help.py +++ b/src/copilot-chat/src/copilot_agent/utils/smart_help.py @@ -9,44 +9,49 @@ from ..utils.llmsession import LLMSession from ..utils.utils import get_prompt_from -model = LLMSession() - -# generate help message for CoPilot agent -def gen_smart_help(help_msg, user_question: str, key_lst: list, SMART_HELP=True) -> str: - """Generate smart help message for CoPilot agent.""" - # dump help method - dump_help = '' - if isinstance(help_msg, dict): - for key in key_lst: - if key in help_msg: - dump_help += help_msg[key] - dump_help += '\n\n' - # version - _version = 'f0f1' - if COPILOT_VERSION == 'f2': - _version = 'f2' - # capability - if _version == 'f2': - capability_str = help_msg['feature'] + help_msg['sku'] + help_msg['workload'] - elif _version == 'f0f1': - capability_str = help_msg['feature'] - else: - capability_str = help_msg['feature'] - - sys_prompt = get_prompt_from(os.path.join(PROMPT_DIR, 'gen_smart_help_prompt.txt')) +class SmartHelp: + """Smart help generator for CoPilot agent.""" - - if SMART_HELP: - # smart help method - capability_promp = f'[features]\n {capability_str} \n\n' - question_prompt = f'[user question]\n {user_question} \n\n' - user_prompt = question_prompt + f'[reason to generate the help]\n str{key_lst} \n\n' + capability_promp - # send to a LLM session to generate a smart help - smart_help = model.chat(sys_prompt, user_prompt) - final_help = smart_help - else: - dump_help_prompt = f'[reason to generate the help]\n {dump_help} \n\n' - final_help = model.chat(sys_prompt, dump_help_prompt) - - return final_help + def __init__(self, help_msg: dict, llm_session: LLMSession): + """Initialize with cached prompts.""" + self.help_msg = help_msg + self.llm_session = llm_session + self._version = 'f0f1' if COPILOT_VERSION != 'f2' else 'f2' + + # Load prompt once during initialization + self.sys_prompt = get_prompt_from( + os.path.join(PROMPT_DIR, 'gen_smart_help_prompt.txt') + ) + + # Prepare capability string based on version + if self._version == 'f2': + self.capability_str = (help_msg['feature'] + + help_msg['sku'] + + help_msg['workload']) + else: + self.capability_str = help_msg['feature'] + + def generate(self, user_question: str, key_lst: list, + smart_help: bool = True) -> str: + """Generate smart help message.""" + # Build dump_help from keys + dump_help = '\n\n'.join( + self.help_msg[key] for key in key_lst + if key in self.help_msg + ) + + if smart_help: + capability_prompt = f'[features]\n {self.capability_str} \n\n' + question_prompt = f'[user question]\n {user_question} \n\n' + user_prompt = (question_prompt + + f'[reason to generate the help]\n {key_lst} \n\n' + + capability_prompt) + return self.llm_session.try_stream_fallback_chat( + self.sys_prompt, user_prompt + ) + else: + dump_help_prompt = f'[reason to generate the help]\n {dump_help} \n\n' + return self.llm_session.try_stream_fallback_chat( + self.sys_prompt, dump_help_prompt + ) \ No newline at end of file diff --git a/src/copilot-chat/src/copilot_agent/utils/summary.py b/src/copilot-chat/src/copilot_agent/utils/summary.py index a994fb1b..1583ef43 100644 --- a/src/copilot-chat/src/copilot_agent/utils/summary.py +++ b/src/copilot-chat/src/copilot_agent/utils/summary.py @@ -11,13 +11,11 @@ from ..config import TOKEN_LIMIT, PROMPT_DIR from 
..utils.llmsession import LLMSession -from ..utils.smart_help import gen_smart_help from ..utils.utils import get_prompt_from -model = LLMSession() def gen_summary( - SUB_FEATURE, resp_total, resp_brief, question, gen_prompt_file, knowledge_prompt_file, help_msg, skip_summary=False + SUB_FEATURE, resp_total, resp_brief, question, gen_prompt_file, knowledge_prompt_file, help_msg, skip_summary=False, llm_session=None ): """Generate a summary.""" logger.info('Generating Response Report') @@ -44,20 +42,28 @@ def gen_summary( sys_prompt = gen_sum_prompt + '\n\n' + knowledge_prmpt + '\n\n' if not skip_summary: logger.info('Bypass summary: False') - summary = model.chat(sys_prompt, user_prompt) + # try stream chat, if fail, fall back to chat + if llm_session is None: + raise ValueError("llm_session is required when skip_summary is False") + summary = llm_session.try_stream_fallback_chat(sys_prompt, user_prompt) else: logger.info('Bypass summary: True') summary = handle_bypass_summary(resp_total, resp_brief) else: logger.info('generating smart help') help_keys = ['corrupted_data'] - summary = help_keys[0] - summary = gen_smart_help(help_msg, question, help_keys) + sys_prompt = help_msg[help_keys[0]] if help_keys[0] in help_msg else help_keys[0] + if llm_session is None: + raise ValueError("llm_session is required for smart help generation") + summary = llm_session.try_stream_fallback_chat(sys_prompt, question) return summary def handle_bypass_summary(resp_total, resp_brief): - """Handle bypass summary logic.""" + """Handle bypass summary logic. + + Returns full data if under TOKEN_LIMIT else brief data. + """ if len(str(resp_total)) < TOKEN_LIMIT: logger.info('Outputing full data') return resp_total diff --git a/src/pylon/deploy/pylon-config/location.conf.template b/src/pylon/deploy/pylon-config/location.conf.template index 02b9e940..71606cb6 100644 --- a/src/pylon/deploy/pylon-config/location.conf.template +++ b/src/pylon/deploy/pylon-config/location.conf.template @@ -63,6 +63,20 @@ location ~ ^/copilot/api/operation(.*)$ { proxy_send_timeout 2m; } +location ~ ^/copilot/api/stream(.*)$ { + proxy_pass {{COPILOT_CHAT_URI}}/copilot/api/stream$1$is_args$args; + proxy_connect_timeout 2m; + proxy_read_timeout 2m; + proxy_send_timeout 2m; + + # Required for SSE (Server-Sent Events) streaming + proxy_buffering off; + proxy_cache off; + proxy_set_header Connection ''; + proxy_http_version 1.1; + chunked_transfer_encoding off; +} + # Model proxy backend location ~ ^/model-proxy/(.*)$ { proxy_pass {{MODEL_PROXY_URI}}/$1$is_args$args;
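# --- Editorial aside (not part of the patch) --------------------------------
# The /copilot/api/stream location block added above turns off proxy buffering
# and caching so Server-Sent Events reach the browser as they are produced.
# Below is a hedged sketch of a client consuming that endpoint: the HTTP method,
# host, and "data:" framing are assumptions about the backend rather than
# anything this patch defines; the JSON payload shapes ({"type": "append"|"meta",
# ...}) mirror the events emitted by push_frontend.py earlier in the diff.
import json
import requests

def consume_stream(base_url: str, payload: dict):
    """Yield decoded events from the proxied SSE endpoint (illustrative only)."""
    url = f'{base_url}/copilot/api/stream'
    with requests.post(url, json=payload, stream=True) as resp:
        resp.raise_for_status()
        for line in resp.iter_lines(decode_unicode=True):
            if not line or not line.startswith('data:'):
                continue
            data = line[len('data:'):].strip()
            try:
                yield json.loads(data)  # e.g. {"type": "append", "text": "..."}
            except json.JSONDecodeError:
                yield {'type': 'append', 'text': data}  # fall back to raw text
# -----------------------------------------------------------------------------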