
Commit f75a360

Author: xusenlin
Commit message: support glm4 for hf engine
Parent: 9e64c58

File tree

4 files changed: 15 additions & 15 deletions


api/config.py

Lines changed: 0 additions & 6 deletions
@@ -130,12 +130,6 @@ class LLMSettings(BaseModel):
         description="Use flash attention."
     )
 
-    # support for transformers.TextIteratorStreamer
-    use_streamer_v2: Optional[bool] = Field(
-        default=get_bool_env("USE_STREAMER_V2", "true"),
-        description="Support for transformers.TextIteratorStreamer."
-    )
-
     interrupt_requests: Optional[bool] = Field(
         default=get_bool_env("INTERRUPT_REQUESTS", "true"),
         description="Whether to interrupt requests when a new request is received.",

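Note: the removed use_streamer_v2 flag was read through get_bool_env, the same helper the surviving interrupt_requests field uses. Below is a minimal sketch of such a helper, assuming it parses common truthy strings; the real implementation lives elsewhere in the repo and may differ.

import os

def get_bool_env(name: str, default: str = "false") -> bool:
    # Treat "true"/"1"/"yes" (case-insensitive) as True; anything else is False.
    return os.getenv(name, default).strip().lower() in {"true", "1", "yes"}

# Under this sketch, interrupt_requests defaults to True unless the
# INTERRUPT_REQUESTS env var is set to a falsy string.
print(get_bool_env("INTERRUPT_REQUESTS", "true"))
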
api/core/default.py

Lines changed: 14 additions & 7 deletions
@@ -42,7 +42,7 @@
     generate_stream_chatglm_v3,
     build_qwen_chat_input,
     check_is_qwen,
-    generate_stream,
+    generate_stream_v2,
     build_xverse_chat_input,
     check_is_xverse,
 )

@@ -65,7 +65,6 @@ def __init__(
         model_name: str,
         context_len: Optional[int] = None,
         prompt_name: Optional[str] = None,
-        use_streamer_v2: Optional[bool] = False,
     ) -> None:
         """
         Initialize the Default class.

@@ -76,7 +75,6 @@ def __init__(
             model_name (str): The name of the model.
             context_len (Optional[int], optional): The length of the context. Defaults to None.
             prompt_name (Optional[str], optional): The name of the prompt. Defaults to None.
-            use_streamer_v2 (Optional[bool], optional): Whether to use Streamer V2. Defaults to False.
         """
         self.model = model
         self.tokenizer = tokenizer

@@ -85,7 +83,6 @@ def __init__(
         self.model_name = model_name.lower()
         self.prompt_name = prompt_name.lower() if prompt_name is not None else None
         self.context_len = context_len
-        self.use_streamer_v2 = use_streamer_v2
 
         self.prompt_adapter = get_prompt_adapter(self.model_name, prompt_name=self.prompt_name)
 
@@ -101,10 +98,11 @@ def _prepare_for_generate(self) -> None:
         3. Checks and constructs the prompt.
         4. Sets the context length if it is not already set.
         """
-        self.generate_stream_func = generate_stream
+        self.generate_stream_func = generate_stream_v2
         if "chatglm3" in self.model_name:
             self.generate_stream_func = generate_stream_chatglm_v3
-            self.use_streamer_v2 = False
+        elif "chatglm4" in self.model_name:
+            self.generate_stream_func = generate_stream_v2
         elif check_is_chatglm(self.model):
             self.generate_stream_func = generate_stream_chatglm
         elif check_is_qwen(self.model):

@@ -118,7 +116,10 @@ def _prepare_for_generate(self) -> None:
     def _check_construct_prompt(self) -> None:
         """ Check whether to need to construct prompts or inputs. """
         self.construct_prompt = self.prompt_name is not None
-        if "chatglm3" in self.model_name:
+        if "chatglm4" in self.model_name:
+            self.construct_prompt = False
+            logger.info("Using ChatGLM4 Model for Chat!")
+        elif "chatglm3" in self.model_name:
             logger.info("Using ChatGLM3 Model for Chat!")
         elif check_is_baichuan(self.model):
             logger.info("Using Baichuan Model for Chat!")

@@ -246,6 +247,12 @@ def build_chat_inputs(
         if "chatglm3" in self.model_name:
             query, role = messages[-1]["content"], messages[-1]["role"]
             inputs = self.tokenizer.build_chat_input(query, history=messages[:-1], role=role)
+        elif "chatglm4" in self.model_name:
+            inputs = self.tokenizer.apply_chat_template(
+                messages,
+                add_generation_prompt=True,
+                tokenize=True,
+            )[0]
         elif check_is_baichuan(self.model):
             inputs = build_baichuan_chat_input(
                 self.tokenizer, messages, self.context_len, max_new_tokens

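Note: the new "chatglm4" branch in build_chat_inputs relies on the tokenizer's own chat template instead of a hand-built prompt. Below is a standalone sketch of that input path, assuming the THUDM/glm-4-9b-chat checkpoint; the model id and messages are illustrative, and the trailing [0] presumably unwraps the batch returned by this tokenizer's template.

from transformers import AutoTokenizer

# trust_remote_code is required because GLM-4 ships a custom tokenizer.
tokenizer = AutoTokenizer.from_pretrained(
    "THUDM/glm-4-9b-chat", trust_remote_code=True
)

messages = [{"role": "user", "content": "What is the capital of France?"}]

# tokenize=True returns token ids ready to feed the model, so no string
# prompt needs to be constructed (matching construct_prompt = False above).
input_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
)[0]
print(len(input_ids), "prompt tokens")
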
api/models.py

Lines changed: 0 additions & 1 deletion
@@ -86,7 +86,6 @@ def create_hf_llm():
         model_name=SETTINGS.model_name,
         context_len=SETTINGS.context_length if SETTINGS.context_length > 0 else None,
         prompt_name=SETTINGS.chat_template,
-        use_streamer_v2=SETTINGS.use_streamer_v2,
     )

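Note: the call above keeps the pattern of treating a non-positive context_length as "unset". A tiny illustration of that normalization, with names chosen for the example rather than taken from the repo:

import os

def normalize_context_len(raw: int) -> int | None:
    # Non-positive values mean "let the engine infer the window from the model config".
    return raw if raw > 0 else None

context_length = int(os.getenv("CONTEXT_LEN", "-1"))
print(normalize_context_len(context_length))  # None unless CONTEXT_LEN > 0
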
examples/chatglm3/tool_using.py

Lines changed: 1 addition & 1 deletion
@@ -11,7 +11,7 @@
 
 client = OpenAI(
     api_key="EMPTY",
-    base_url="http://192.168.20.59:7891/v1/",
+    base_url="http://192.168.0.59:7860/v1/",
 )
 
 functions = list(get_tools().values())

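Note: with the updated base_url, any OpenAI-compatible client call should work against the server. A minimal request, assuming the API server from this repo is running at 192.168.0.59:7860 and serves a model registered as "chatglm3" (the model name here is an assumption):

from openai import OpenAI

client = OpenAI(api_key="EMPTY", base_url="http://192.168.0.59:7860/v1/")

resp = client.chat.completions.create(
    model="chatglm3",
    messages=[{"role": "user", "content": "Hello!"}],
)
print(resp.choices[0].message.content)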