jd-opensource
diff --git a/xllm/api_service/CMakeLists.txt b/xllm/api_service/CMakeLists.txt
Lines changed: 2 additions & 0 deletions
diff --git a/xllm/api_service/api_service.cpp b/xllm/api_service/api_service.cpp
Lines changed: 71 additions & 48 deletions
diff --git a/xllm/api_service/api_service.h b/xllm/api_service/api_service.h
Lines changed: 1 addition & 0 deletions
diff --git a/xllm/api_service/completion_service_impl.cpp b/xllm/api_service/completion_service_impl.cpp
Lines changed: 9 additions & 5 deletions
@@ -8,6 +8,7 @@ cc_library(
     api_service_impl.h
     call.h
     completion_service_impl.h
+    rec_completion_service_impl.h
     chat_service_impl.h
     embedding_service_impl.h
     image_generation_service_impl.h
@@ -23,6 +24,7 @@ cc_library(
     api_service.cpp
     call.cpp
     completion_service_impl.cpp
+    rec_completion_service_impl.cpp
     chat_service_impl.cpp
     embedding_service_impl.cpp
     image_generation_service_impl.cpp
 
@@ -27,11 +27,14 @@ limitations under the License.
 #include "core/common/metrics.h"
 #include "core/runtime/dit_master.h"
 #include "core/runtime/llm_master.h"
+// TODO: enable the following include in the next PR.
+// #include "core/runtime/rec_master.h"
 #include "core/runtime/vlm_master.h"
 #include "core/util/closure_guard.h"
 #include "embedding.pb.h"
 #include "image_generation.pb.h"
 #include "models.pb.h"
+#include "rec_completion_service_impl.h"
 #include "service_impl_factory.h"
 #include "xllm_metrics.h"
 namespace xllm {
@@ -70,6 +73,11 @@ APIService::APIService(Master* master,
     image_generation_service_impl_ =
         std::make_unique<ImageGenerationServiceImpl>(
             dynamic_cast<DiTMaster*>(master), model_names);
+  } else if (FLAGS_backend == "rec") {
+    // TODO: remove this temporary alias in the next PR.
+    using RecMaster = LLMMaster;
+    rec_completion_service_impl_ = std::make_unique<RecCompletionServiceImpl>(
+        dynamic_cast<RecMaster*>(master), model_names);
   }
   models_service_impl_ =
       ServiceImplFactory<ModelsServiceImpl>::create_service_impl(
@@ -80,13 +88,6 @@ void APIService::Completions(::google::protobuf::RpcController* controller,
                              const proto::CompletionRequest* request,
                              proto::CompletionResponse* response,
                              ::google::protobuf::Closure* done) {
-  // TODO with xllm-service
-}
-
-void APIService::CompletionsHttp(::google::protobuf::RpcController* controller,
-                                 const proto::HttpRequest* request,
-                                 proto::HttpResponse* response,
-                                 ::google::protobuf::Closure* done) {
   xllm::ClosureGuard done_guard(
       done,
       std::bind(request_in_metric, nullptr),
@@ -95,66 +96,89 @@ void APIService::CompletionsHttp(::google::protobuf::RpcController* controller,
     LOG(ERROR) << "brpc request | respose | controller is null";
     return;
   }
+  auto ctrl = reinterpret_cast<brpc::Controller*>(controller);
 
-  auto arena = response->GetArena();
+  if (FLAGS_backend == "llm" || FLAGS_backend == "vlm") {
+    CHECK(completion_service_impl_) << " completion service is invalid.";
+    std::shared_ptr<Call> call = std::make_shared<CompletionCall>(
+        ctrl,
+        done_guard.release(),
+        const_cast<proto::CompletionRequest*>(request),
+        response);
+    completion_service_impl_->process_async(call);
+  } else if (FLAGS_backend == "rec") {
+    CHECK(rec_completion_service_impl_)
+        << " rec completion service is invalid.";
+    std::shared_ptr<Call> call = std::make_shared<CompletionCall>(
+        ctrl,
+        done_guard.release(),
+        const_cast<proto::CompletionRequest*>(request),
+        response);
+    rec_completion_service_impl_->process_async(call);
+  }
+}
+
+namespace {
+template <typename Call, typename Service>
+void CommonCompletionsImpl(std::unique_ptr<Service>& service,
+                           xllm::ClosureGuard& guard,
+                           ::google::protobuf::Arena* arena,
+                           brpc::Controller* ctrl) {
   auto req_pb =
-      google::protobuf::Arena::CreateMessage<proto::CompletionRequest>(arena);
+      google::protobuf::Arena::CreateMessage<typename Call::ReqType>(arena);
   auto resp_pb =
-      google::protobuf::Arena::CreateMessage<proto::CompletionResponse>(arena);
+      google::protobuf::Arena::CreateMessage<typename Call::ResType>(arena);
 
-  auto ctrl = reinterpret_cast<brpc::Controller*>(controller);
   std::string error;
   json2pb::Json2PbOptions options;
   butil::IOBuf& buf = ctrl->request_attachment();
   butil::IOBufAsZeroCopyInputStream iobuf_stream(buf);
   auto st = json2pb::JsonToProtoMessage(&iobuf_stream, req_pb, options, &error);
   if (!st) {
     ctrl->SetFailed(error);
     LOG(ERROR) << "parse json to proto failed: " << error;
     return;
   }
 
-  std::shared_ptr<Call> call = std::make_shared<CompletionCall>(
-      ctrl, done_guard.release(), req_pb, resp_pb);
-  completion_service_impl_->process_async(call);
+  auto call = std::make_shared<Call>(ctrl, guard.release(), req_pb, resp_pb);
+  service->process_async(call);
 }
+}  // namespace
 
-void APIService::ChatCompletions(::google::protobuf::RpcController* controller,
-                                 const proto::ChatRequest* request,
-                                 proto::ChatResponse* response,
+void APIService::CompletionsHttp(::google::protobuf::RpcController* controller,
+                                 const proto::HttpRequest* request,
+                                 proto::HttpResponse* response,
                                  ::google::protobuf::Closure* done) {
-  // TODO with xllm-service
-}
-
-namespace {
-template <typename ChatCall, typename Service>
-void ChatCompletionsImpl(std::unique_ptr<Service>& service,
-                         xllm::ClosureGuard& guard,
-                         ::google::protobuf::Arena* arena,
-                         brpc::Controller* ctrl) {
-  auto req_pb =
-      google::protobuf::Arena::CreateMessage<typename ChatCall::ReqType>(arena);
-  auto resp_pb =
-      google::protobuf::Arena::CreateMessage<typename ChatCall::ResType>(arena);
+  xllm::ClosureGuard done_guard(
+      done,
+      std::bind(request_in_metric, nullptr),
+      std::bind(request_out_metric, (void*)controller));
+  if (!request || !response || !controller) {
+    LOG(ERROR) << "brpc request | respose | controller is null";
+    return;
+  }
 
-  std::string attachment = std::move(ctrl->request_attachment().to_string());
-  std::string error;
+  auto arena = response->GetArena();
+  auto ctrl = reinterpret_cast<brpc::Controller*>(controller);
 
-  google::protobuf::util::JsonParseOptions options;
-  options.ignore_unknown_fields = true;
-  auto json_status =
-      google::protobuf::util::JsonStringToMessage(attachment, req_pb, options);
-  if (!json_status.ok()) {
-    ctrl->SetFailed(json_status.ToString());
-    LOG(ERROR) << "parse json to proto failed: " << json_status.ToString();
-    return;
+  if (FLAGS_backend == "llm" || FLAGS_backend == "vlm") {
+    CHECK(completion_service_impl_) << " completion service is invalid.";
+    CommonCompletionsImpl<CompletionCall, CompletionServiceImpl>(
+        completion_service_impl_, done_guard, arena, ctrl);
+  } else if (FLAGS_backend == "rec") {
+    CHECK(rec_completion_service_impl_)
+        << " rec completion service is invalid.";
+    CommonCompletionsImpl<CompletionCall, RecCompletionServiceImpl>(
+        rec_completion_service_impl_, done_guard, arena, ctrl);
   }
+}
 
-  auto call = std::make_shared<ChatCall>(
-      ctrl, guard.release(), req_pb, resp_pb, arena != nullptr /*use_arena*/);
-  service->process_async(call);
+void APIService::ChatCompletions(::google::protobuf::RpcController* controller,
+                                 const proto::ChatRequest* request,
+                                 proto::ChatResponse* response,
+                                 ::google::protobuf::Closure* done) {
+  // TODO with xllm-service
 }
-}  // namespace
 
 void APIService::ChatCompletionsHttp(
     ::google::protobuf::RpcController* controller,
@@ -175,12 +199,11 @@ void APIService::ChatCompletionsHttp(
   if (FLAGS_backend == "llm") {
     auto arena = response->GetArena();
     CHECK(chat_service_impl_) << " chat service is invalid.";
-    ChatCompletionsImpl<ChatCall, ChatServiceImpl>(
+    CommonCompletionsImpl<ChatCall, ChatServiceImpl>(
         chat_service_impl_, done_guard, arena, ctrl);
   } else if (FLAGS_backend == "vlm") {
     CHECK(mm_chat_service_impl_) << " mm chat service is invalid.";
-    // TODO: fix me - temporarily using heap allocation instead of arena
-    ChatCompletionsImpl<MMChatCall, MMChatServiceImpl>(
+    CommonCompletionsImpl<MMChatCall, MMChatServiceImpl>(
         mm_chat_service_impl_, done_guard, nullptr, ctrl);
   }
 }
 
@@ -124,6 +124,7 @@ class APIService : public proto::XllmAPIService {
   std::unique_ptr<ModelsServiceImpl> models_service_impl_;
   std::unique_ptr<ImageGenerationServiceImpl> image_generation_service_impl_;
   std::unique_ptr<RerankServiceImpl> rerank_service_impl_;
+  std::unique_ptr<RecCompletionServiceImpl> rec_completion_service_impl_;
 };
 
 }  // namespace xllm
@@ -26,6 +26,7 @@ limitations under the License.
 
 #include "common/instance_name.h"
 #include "completion.pb.h"
+#include "core/framework/request/mm_data.h"
 #include "core/framework/request/request_output.h"
 #include "core/runtime/llm_master.h"
 #include "core/util/utils.h"
@@ -126,6 +127,7 @@ bool send_result_to_client_brpc(std::shared_ptr<CompletionCall> call,
   response.set_created(created_time);
   response.set_model(model);
 
+  // add choices into response
   response.mutable_choices()->Reserve(req_output.outputs.size());
   for (const auto& output : req_output.outputs) {
     auto* choice = response.add_choices();
@@ -137,6 +139,7 @@ bool send_result_to_client_brpc(std::shared_ptr<CompletionCall> call,
     }
   }
 
+  // add usage statistics
   if (req_output.usage.has_value()) {
     const auto& usage = req_output.usage.value();
     auto* proto_usage = response.mutable_usage();
@@ -163,6 +166,7 @@ CompletionServiceImpl::CompletionServiceImpl(
 void CompletionServiceImpl::process_async_impl(
     std::shared_ptr<CompletionCall> call) {
   const auto& rpc_request = call->request();
+
   // check if model is supported
   const auto& model = rpc_request.model();
   if (unlikely(!models_.contains(model))) {
@@ -196,20 +200,20 @@ void CompletionServiceImpl::process_async_impl(
     request_params.decode_address = rpc_request.routing().decode_name();
   }
 
+  // schedule the request
   auto saved_streaming = request_params.streaming;
   auto saved_request_id = request_params.request_id;
-  // schedule the request
   master_->handle_request(
-      std::move(rpc_request.prompt()),
+      std::move(call->request().prompt()),
       std::move(prompt_tokens),
       std::move(request_params),
       call.get(),
       [call,
        model,
        master = master_,
-       stream = std::move(saved_streaming),
-       include_usage = include_usage,
-       request_id = std::move(saved_request_id),
+       stream = saved_streaming,
+       include_usage,
+       request_id = saved_request_id,
        created_time = absl::ToUnixSeconds(absl::Now())](
           const RequestOutput& req_output) -> bool {
         if (req_output.status.has_value()) {