@@ -126,7 +126,7 @@ def infer_duration(self, sequence, speaker_embedding=None, alpha=1.0, non_empty_
126126 "input_mask" : input_mask ,
127127 "pos_mask" : pos_mask }
128128 if speaker_embedding is not None :
129- inputs ["speaker_embedding" ] = np .array ([ speaker_embedding ] )
129+ inputs ["speaker_embedding" ] = np .array (speaker_embedding )
130130 self .duration_predictor_request .infer (inputs )
131131 else :
132132 self .duration_predictor_request .infer (inputs = {"input_seq" : sequence })
@@ -154,7 +154,7 @@ def infer_mel(self, aligned_emb, non_empty_symbols, speaker_embedding=None):
154154 "data_mask" : data_mask ,
155155 "pos_mask" : pos_mask }
156156 if speaker_embedding is not None :
157- inputs ["speaker_embedding" ] = np .array ([ speaker_embedding ] )
157+ inputs ["speaker_embedding" ] = np .array (speaker_embedding )
158158 self .forward_request .infer (inputs )
159159 else :
160160 self .forward_request .infer (inputs = {"data" : aligned_emb })
@@ -215,7 +215,7 @@ def forward(self, text, alpha=1.0, speaker_id=19, speaker_emb=None):
215215 if speaker_emb is not None :
216216 speaker_embedding = speaker_emb
217217 else :
218- speaker_embedding = self .speaker_embeddings [speaker_id , :]
218+ speaker_embedding = [ self .speaker_embeddings [speaker_id , :] ]
219219
220220 aligned_emb = self .forward_duration_prediction_by_delimiters (text , speaker_embedding , alpha )
221221
0 commit comments