Changed the embedding model to all-mpnet-base-v2

Yuval-Roth · olgaoznovich · Yuval-Roth · commit b474d156df12 · 2025-06-20T13:07:05.000+03:00
Added flags for its usage and updated Milvus to use cosine similarity instead of L2 distance for searching.

Co-authored-by: olgaoznovich <ol.oznovich@gmail.com>
Co-authored-by: Yuval-Roth <rothyuv@post.bgu.ac.il>
diff --git a/flask4modelcache.py b/flask4modelcache.py
@@ -6,6 +6,8 @@
 import json
 from modelcache import cache
 from modelcache.adapter import adapter
+from modelcache.embedding.mpnet_base import MPNet_Base
+from modelcache.manager.vector_data import manager
 from modelcache.manager import CacheBase, VectorBase, get_data_manager, data_manager
 from modelcache.similarity_evaluation.distance import SearchDistanceEvaluation
 from modelcache.processor.pre import query_multi_splicing
@@ -30,9 +32,17 @@ def save_query_info(result, model, query, delta_time_log):
 def response_hitquery(cache_resp):
     return cache_resp['hitQuery']
 
-data2vec = Data2VecAudio()
-embedding_func = data2vec.to_embeddings
-dimension = data2vec.dimension
+manager.MPNet_base = True
+
+if manager.MPNet_base:
+    mpnet_base = MPNet_Base()
+    embedding_func = lambda x: mpnet_base.embedding_func(x)
+    dimension =  mpnet_base.dimension
+    data_manager.NORMALIZE = False
+else:
+    data2vec = Data2VecAudio()
+    embedding_func = data2vec.to_embeddings
+    dimension = data2vec.dimension
 
 mysql_config = configparser.ConfigParser()
 mysql_config.read('modelcache/config/mysql_config.ini')
@@ -49,8 +59,30 @@ def response_hitquery(cache_resp):
 # chromadb_config = configparser.ConfigParser()
 # chromadb_config.read('modelcache/config/chromadb_config.ini')
 
-data_manager = get_data_manager(CacheBase("mysql", config=mysql_config),
-                                VectorBase("milvus", dimension=dimension, milvus_config=milvus_config))
+data_manager = get_data_manager(
+    CacheBase("mysql", config=mysql_config),
+    VectorBase("milvus",
+               dimension=dimension,
+               milvus_config=milvus_config,
+               index_params={
+                   "metric_type": "COSINE",
+                   "index_type": "HNSW",
+                   "params": {"M": 16, "efConstruction": 64},
+                } if manager.MPNet_base else None,
+                search_params={
+                    "IVF_FLAT": {"metric_type": "COSINE", "params": {"nprobe": 10}},
+                    "IVF_SQ8": {"metric_type": "COSINE", "params": {"nprobe": 10}},
+                    "IVF_PQ": {"metric_type": "COSINE", "params": {"nprobe": 10}},
+                    "HNSW": {"metric_type": "COSINE", "params": {"ef": 10}},
+                    "RHNSW_FLAT": {"metric_type": "COSINE", "params": {"ef": 10}},
+                    "RHNSW_SQ": {"metric_type": "COSINE", "params": {"ef": 10}},
+                    "RHNSW_PQ": {"metric_type": "COSINE", "params": {"ef": 10}},
+                    "IVF_HNSW": {"metric_type": "COSINE", "params": {"nprobe": 10, "ef": 10}},
+                    "ANNOY": {"metric_type": "COSINE", "params": {"search_k": 10}},
+                    "AUTOINDEX": {"metric_type": "COSINE", "params": {}},
+                } if manager.MPNet_base else None
+    )
+)
 
 
 # data_manager = get_data_manager(CacheBase("mysql", config=mysql_config),
diff --git a/model/download_bert_embedder.bat b/model/download_bert_embedder.bat
diff --git a/modelcache/adapter/adapter_query.py b/modelcache/adapter/adapter_query.py
@@ -1,11 +1,13 @@
 # -*- coding: utf-8 -*-
 import logging
 import time
+
 from modelcache import cache
 from modelcache.utils.error import NotInitError
 from modelcache.utils.time import time_cal
 from modelcache.processor.pre import multi_analysis
 from FlagEmbedding import FlagReranker
+from modelcache.manager.vector_data import manager
 
 USE_RERANKER = False  # 如果为 True 则启用 reranker，否则使用原有逻辑
 
@@ -44,39 +46,47 @@ def adapt_query(cache_data_convert, *args, **kwargs):
         cache_answers = []
         cache_questions = []
         cache_ids = []
-        similarity_threshold = chat_cache.config.similarity_threshold
-        similarity_threshold_long = chat_cache.config.similarity_threshold_long
+        cosine_similarity = cache_data_list[0][0]
 
-        min_rank, max_rank = chat_cache.similarity_evaluation.range()
-        rank_threshold = (max_rank - min_rank) * similarity_threshold * cache_factor
-        rank_threshold_long = (max_rank - min_rank) * similarity_threshold_long * cache_factor
-        rank_threshold = (
-            max_rank
-            if rank_threshold > max_rank
-            else min_rank
-            if rank_threshold < min_rank
-            else rank_threshold
-        )
-        rank_threshold_long = (
-            max_rank
-            if rank_threshold_long > max_rank
-            else min_rank
-            if rank_threshold_long < min_rank
-            else rank_threshold_long
-        )
-        if cache_data_list is None or len(cache_data_list) == 0:
-            rank_pre = -1.0
+        if manager.MPNet_base:
+            # This code uses the built-in cosine similarity evaluation in milvus
+            if cosine_similarity < 0.9:
+                return None
         else:
-            cache_data_dict = {'search_result': cache_data_list[0]}
-            rank_pre = chat_cache.similarity_evaluation.evaluation(
-                None,
-                cache_data_dict,
-                extra_param=context.get("evaluation_func", None),
+            ## this is the code that uses L2 for similarity evaluation
+            similarity_threshold = chat_cache.config.similarity_threshold
+            similarity_threshold_long = chat_cache.config.similarity_threshold_long
+
+            min_rank, max_rank = chat_cache.similarity_evaluation.range()
+            rank_threshold = (max_rank - min_rank) * similarity_threshold * cache_factor
+            rank_threshold_long = (max_rank - min_rank) * similarity_threshold_long * cache_factor
+            rank_threshold = (
+                max_rank
+                if rank_threshold > max_rank
+                else min_rank
+                if rank_threshold < min_rank
+                else rank_threshold
+            )
+            rank_threshold_long = (
+                max_rank
+                if rank_threshold_long > max_rank
+                else min_rank
+                if rank_threshold_long < min_rank
+                else rank_threshold_long
             )
-        if rank_pre < rank_threshold:
-            return None
+            if cache_data_list is None or len(cache_data_list) == 0:
+                rank_pre = -1.0
+            else:
+                cache_data_dict = {'search_result': cache_data_list[0]}
+                rank_pre = chat_cache.similarity_evaluation.evaluation(
+                    None,
+                    cache_data_dict,
+                    extra_param=context.get("evaluation_func", None),
+                )
+            if rank_pre < rank_threshold:
+                return None
 
-        if USE_RERANKER:
+        if USE_RERANKER and not manager.MPNet_base:
             reranker = FlagReranker('BAAI/bge-reranker-v2-m3', use_fp16=False)
             for cache_data in cache_data_list:
                 primary_id = cache_data[1]
@@ -132,45 +142,50 @@ def adapt_query(cache_data_convert, *args, **kwargs):
                 if ret is None:
                     continue
 
-                if "deps" in context and hasattr(ret.question, "deps"):
-                    eval_query_data = {
-                        "question": context["deps"][0]["data"],
-                        "embedding": None
-                    }
-                    eval_cache_data = {
-                        "question": ret.question.deps[0].data,
-                        "answer": ret.answers[0].answer,
-                        "search_result": cache_data,
-                        "embedding": None,
-                    }
+                if manager.MPNet_base:
+                    cache_answers.append((cosine_similarity, ret[1]))
+                    cache_questions.append((cosine_similarity, ret[0]))
+                    cache_ids.append((cosine_similarity, primary_id))
                 else:
-                    eval_query_data = {
-                        "question": pre_embedding_data,
-                        "embedding": embedding_data,
-                    }
+                    if "deps" in context and hasattr(ret.question, "deps"):
+                        eval_query_data = {
+                            "question": context["deps"][0]["data"],
+                            "embedding": None
+                        }
+                        eval_cache_data = {
+                            "question": ret.question.deps[0].data,
+                            "answer": ret.answers[0].answer,
+                            "search_result": cache_data,
+                            "embedding": None,
+                        }
+                    else:
+                        eval_query_data = {
+                            "question": pre_embedding_data,
+                            "embedding": embedding_data,
+                        }
 
-                    eval_cache_data = {
-                        "question": ret[0],
-                        "answer": ret[1],
-                        "search_result": cache_data,
-                        "embedding": None
-                    }
-                rank = chat_cache.similarity_evaluation.evaluation(
-                    eval_query_data,
-                    eval_cache_data,
-                    extra_param=context.get("evaluation_func", None),
-                )
+                        eval_cache_data = {
+                            "question": ret[0],
+                            "answer": ret[1],
+                            "search_result": cache_data,
+                            "embedding": None
+                        }
+                    rank = chat_cache.similarity_evaluation.evaluation(
+                        eval_query_data,
+                        eval_cache_data,
+                        extra_param=context.get("evaluation_func", None),
+                    )
 
-                if len(pre_embedding_data) <= 256:
-                    if rank_threshold <= rank:
-                        cache_answers.append((rank, ret[1]))
-                        cache_questions.append((rank, ret[0]))
-                        cache_ids.append((rank, primary_id))
-                else:
-                    if rank_threshold_long <= rank:
-                        cache_answers.append((rank, ret[1]))
-                        cache_questions.append((rank, ret[0]))
-                        cache_ids.append((rank, primary_id))
+                    if len(pre_embedding_data) <= 256:
+                        if rank_threshold <= rank:
+                            cache_answers.append((rank, ret[1]))
+                            cache_questions.append((rank, ret[0]))
+                            cache_ids.append((rank, primary_id))
+                    else:
+                        if rank_threshold_long <= rank:
+                            cache_answers.append((rank, ret[1]))
+                            cache_questions.append((rank, ret[0]))
+                            cache_ids.append((rank, primary_id))
 
         cache_answers = sorted(cache_answers, key=lambda x: x[0], reverse=True)
         cache_questions = sorted(cache_questions, key=lambda x: x[0], reverse=True)
diff --git a/modelcache/embedding/mpnet_base.py b/modelcache/embedding/mpnet_base.py
@@ -0,0 +1,17 @@
+from sentence_transformers import SentenceTransformer
+
+class MPNet_Base:
+    def __init__(self):
+        self.dimension = 768
+        self.model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
+
+    def embedding_func(self, *args, **kwargs):
+        if not args:
+            raise ValueError("No word provided for embedding.")
+        embeddings = self.model.encode(args)
+        return embeddings[0] if len(args) == 1 else embeddings
+
+    def similarity(self, a, b):
+        if not a or not b:
+            raise ValueError("Both inputs must be non-empty for similarity calculation.")
+        return self.model.similarity(a, b)
diff --git a/modelcache/manager/vector_data/manager.py b/modelcache/manager/vector_data/manager.py
@@ -17,6 +17,8 @@
 
 COLLECTION_NAME = "modelcache"
 
+MPNet_base = False # whether to use MPNet base model for embedding, if True, will use cosine similarity evaluation in milvus
+
 
 class VectorBase:
     """
diff --git a/requirements.txt b/requirements.txt
@@ -19,5 +19,4 @@ elasticsearch==7.10.0
 snowflake-id==1.0.2
 flagembedding==1.3.4
 cryptography==45.0.2
-mediapipe==0.10.21
-protobuf==4.25.8
+sentence-transformers==4.1.0