File tree Expand file tree Collapse file tree 3 files changed +54
-0
lines changed Expand file tree Collapse file tree 3 files changed +54
-0
lines changed Original file line number Diff line number Diff line change 1+ # -*- coding: utf-8 -*-
2+ import torch
3+ import numpy as np
4+ from transformers import AutoModelForCausalLM
5+
6+
# Export the input-embedding matrix of a locally stored causal LM to a .npy file.
model_path = ''  # directory of the pretrained checkpoint; must be filled in before running
# Use the GPU when one is present, but fall back to CPU so the export also
# works on hosts without CUDA (the weights end up on the CPU either way).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = AutoModelForCausalLM.from_pretrained(model_path, local_files_only=True).to(device)
# detach() drops autograd tracking; the tensor must live in host memory
# before it can be converted to a NumPy array.
embedding_weights = model.get_input_embeddings().weight.detach().to('cpu').numpy()
np.save('gpt-neox-embedding.npy', embedding_weights)
Original file line number Diff line number Diff line change 22from modelcache .utils .lazy_import import LazyImport
33huggingface = LazyImport ("huggingface" , globals (), "modelcache.embedding.huggingface" )
44data2vec = LazyImport ("data2vec" , globals (), "modelcache.embedding.data2vec" )
5+ llmEmb = LazyImport ("llmEmb" , globals (), "modelcache.embedding.llmEmb" )
56
67
78def Huggingface (model = "sentence-transformers/all-mpnet-base-v2" ):
@@ -10,3 +11,7 @@ def Huggingface(model="sentence-transformers/all-mpnet-base-v2"):
1011
def Data2VecAudio(model="facebook/data2vec-audio-base-960h"):
    """Build a Data2VecAudio embedding object.

    :param model: model identifier, forwarded unchanged to
        ``data2vec.Data2VecAudio``.
    :return: the object produced by ``data2vec.Data2VecAudio(model)``.
    """
    audio_embedder = data2vec.Data2VecAudio(model)
    return audio_embedder
14+
15+
def LlmEmb2vecAudio():
    """Build an ``llmEmb.LlmEmb2Vec`` embedding object with default settings.

    NOTE(review): the ``Audio`` suffix looks like a leftover from
    ``Data2VecAudio``; the name is kept because callers import it -- confirm
    before renaming.

    :return: the object produced by ``llmEmb.LlmEmb2Vec()``.
    """
    llm_embedder = llmEmb.LlmEmb2Vec()
    return llm_embedder
Original file line number Diff line number Diff line change 1+ # -*- coding: utf-8 -*-
2+ import numpy as np
3+ from modelcache .embedding .base import BaseEmbedding
4+ from transformers import AutoTokenizer
5+ from transformers import AutoConfig
6+
7+
class LlmEmb2Vec(BaseEmbedding):
    """Text embedding based on a pre-exported token-embedding matrix.

    A text is tokenized, the row of the matrix for every token id is looked
    up, and the rows are averaged into a single vector.
    """

    def __init__(self, model_path='', model_name=''):
        """Load the embedding matrix, the model config and the tokenizer.

        :param model_path: directory that holds the model config, the
            tokenizer files and the ``.npy`` embedding matrix.
        :param model_name: file name of the ``.npy`` matrix inside
            ``model_path`` (e.g. ``13b-mft-embedding.npy``).
        """
        # Local import keeps this class self-contained; the module's import
        # header is not modified by this change.
        import os

        self.model_name = model_name
        # os.path.join inserts the separator when model_path has no trailing
        # slash; plain string concatenation produced a wrong path in that case.
        model_file = os.path.join(model_path, model_name)
        # local_files_only mirrors the tokenizer load below so that neither
        # call reaches out to the network.
        config = AutoConfig.from_pretrained(model_path, local_files_only=True)
        self.__dimension = config.hidden_size
        self.model = np.load(model_file)
        self.tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True)

    def to_embeddings(self, data, **_):
        """Generate embedding given text input

        :param data: text in string.
        :return: a text embedding in shape of (dim,).
        """
        input_ids = self.tokenizer.encode(data, add_special_tokens=True)
        if len(input_ids) == 0:
            # Averaging zero rows would yield a NaN vector plus a
            # RuntimeWarning; return a zero vector of the same shape instead.
            return np.zeros(self.model.shape[1:], dtype=self.model.dtype)
        embedding_array = self.model[input_ids].mean(axis=0)
        return embedding_array

    def post_proc(self, token_embeddings, inputs):
        """Do nothing; this implementation has no post-processing step.

        NOTE(review): presumably kept to match a hook on the base class --
        confirm against ``BaseEmbedding``.
        """
        pass

    @property
    def dimension(self):
        """Embedding dimension.

        :return: embedding dimension, taken from the model config's
            ``hidden_size``.
        """
        return self.__dimension
You can’t perform that action at this time.
0 commit comments