diff --git a/main/manager-api/src/main/resources/db/changelog/202510141200.sql b/main/manager-api/src/main/resources/db/changelog/202510141200.sql new file mode 100644 index 0000000000..30a2aaadfd --- /dev/null +++ b/main/manager-api/src/main/resources/db/changelog/202510141200.sql @@ -0,0 +1,15 @@ +-- 为阿里百炼流式语音合成添加多语言音色配置字段 +UPDATE `ai_model_provider` SET fields = '[{"key":"api_key","type":"string","label":"API密钥"},{"key":"output_dir","type":"string","label":"输出目录"},{"key":"model","type":"string","label":"模型名称"},{"key":"format","label":"音频格式","type":"string"},{"key":"sample_rate","label":"采样率","type":"number"},{"key": "volume", "type": "number", "label": "音量"},{"key": "rate", "type": "number", "label": "语速"},{"key": "pitch", "type": "number", "label": "音调"},{"key":"voice","type":"string","label":"默认音色"},{"key": "voice_zh", "type": "string", "label": "中文音色"},{"key": "voice_yue", "type": "string", "label": "粤语音色"},{"key": "voice_en", "type": "string", "label": "英语音色"},{"key": "voice_ja", "type": "string", "label": "日语音色"},{"key": "voice_ko", "type": "string", "label": "韩语音色"}]' WHERE id = 'SYSTEM_TTS_AliBLStreamTTS'; + +-- 更新配置说明 +UPDATE `ai_model_config` SET +`doc_link` = 'https://bailian.console.aliyun.com/?apiKey=1#/api-key', +`remark` = '阿里百炼流式TTS说明: +1. 访问 https://bailian.console.aliyun.com/?apiKey=1#/api-key 创建项目并获取appkey +2. 支持实时流式合成,具有较低的延迟 +3. 支持多种音色设置和音频参数调整 +4. 使用FunASR进行语音识别时,可以自动选择对应语言音色 +5. 支持CosyVoice-V3大模型音色,价格实惠(0.4元/万字符) +6. 支持实时调节音量、语速、音调等参数 +7. 
如果需要使用CosyVoice-V3模型和一些限制类型的音色,需要联系阿里百炼客服申请 +' WHERE `id` = 'TTS_AliBLStreamTTS'; \ No newline at end of file diff --git a/main/xiaozhi-server/config.yaml b/main/xiaozhi-server/config.yaml index a408739520..c69ab3b65b 100644 --- a/main/xiaozhi-server/config.yaml +++ b/main/xiaozhi-server/config.yaml @@ -838,6 +838,16 @@ TTS: access_key_secret: 你的阿里云账号access_key_secret # 截至2025年7月21日大模型音色只有北京节点采用,其他节点暂不支持 host: nls-gateway-cn-beijing.aliyuncs.com + + # 多语言音色配置 - 根据ASR识别的语言标签自动切换音色 + # 多语言仅限搭配FunASR SenseVoiceSmall模型使用 + voice_zh: longxiaochun # 中文音色 + voice_en: longchen # 英文音色 + voice_yue: longyu # 粤语音色 + voice_ja: longchen # 日语音色 + voice_ko: longchen # 韩语音色 + default_voice: longxiaochun # 默认音色(当语言标签不匹配或无语言标签时使用) + # 以下可不用设置,使用默认设置 # format: pcm # 音频格式:pcm、wav、mp3 # sample_rate: 16000 # 采样率:8000、16000、24000 diff --git a/main/xiaozhi-server/core/connection.py b/main/xiaozhi-server/core/connection.py index d5b9ca4a91..dbdc03a57b 100644 --- a/main/xiaozhi-server/core/connection.py +++ b/main/xiaozhi-server/core/connection.py @@ -129,6 +129,7 @@ def __init__( # 所以涉及到ASR的变量,需要在这里定义,属于connection的私有变量 self.asr_audio = [] self.asr_audio_queue = queue.Queue() + self.current_language_tag = None # 存储当前ASR识别的语言标签 # llm相关变量 self.llm_finish_task = True diff --git a/main/xiaozhi-server/core/handle/receiveAudioHandle.py b/main/xiaozhi-server/core/handle/receiveAudioHandle.py index 4eaf9ab1de..84026f000b 100644 --- a/main/xiaozhi-server/core/handle/receiveAudioHandle.py +++ b/main/xiaozhi-server/core/handle/receiveAudioHandle.py @@ -1,3 +1,4 @@ +import re import time import json import asyncio @@ -41,6 +42,32 @@ async def startToChat(conn, text): # 检查输入是否是JSON格式(包含说话人信息) speaker_name = None actual_text = text + language_tag = None + + # 检查当前使用的ASR是否为FunASR(本地或服务版本) + is_funasr = False + if hasattr(conn, 'asr') and conn.asr: + asr_module = conn.asr.__class__.__module__ + if 'fun_local' in asr_module or 'fun_server' in asr_module: + is_funasr = True + 
conn.logger.bind(tag=TAG).debug(f"检测到FunASR语音识别: {asr_module}") + + # 只有在使用FunASR时才处理语言标签 + if is_funasr: + # 检查是否包含语言标签(如<|zh|>、<|en|>等);标签集合需与 core/providers/asr/utils.py 的 custom_lang_filter 保持一致(含 nospeech,否则该标签会泄漏进 LLM 文本) + lang_pattern = r'<\|(zh|en|yue|ja|ko|nospeech)\|>' + lang_match = re.search(lang_pattern, text) + if lang_match: + language_tag = lang_match.group(1) + conn.current_language_tag = language_tag + conn.logger.bind(tag=TAG).info(f"检测到FunASR语言标签: {language_tag}") + + # 移除语言标签,保留纯文本内容 + actual_text = re.sub(lang_pattern, '', text).strip() + conn.logger.bind(tag=TAG).debug(f"移除语言标签后的文本: {actual_text}") + else: + # 没有检测到语言标签时,清空之前的标签 + conn.current_language_tag = None try: # 尝试解析JSON格式的输入 @@ -63,6 +90,10 @@ async def startToChat(conn, text): else: conn.current_speaker = None + # 如果不是FunASR,清空语言标签,不影响其他ASR + if not is_funasr: + conn.current_language_tag = None + if conn.need_bind: await check_bind_device(conn) return diff --git a/main/xiaozhi-server/core/providers/asr/fun_local.py b/main/xiaozhi-server/core/providers/asr/fun_local.py index 217f17ff7f..a8f90a4600 100644 --- a/main/xiaozhi-server/core/providers/asr/fun_local.py +++ b/main/xiaozhi-server/core/providers/asr/fun_local.py @@ -6,6 +6,7 @@ from config.logger import setup_logging from typing import Optional, Tuple, List from core.providers.asr.base import ASRProviderBase +from core.providers.asr.utils import custom_lang_filter from funasr import AutoModel from funasr.utils.postprocess_utils import rich_transcription_postprocess import shutil @@ -99,7 +100,11 @@ async def speech_to_text( use_itn=True, batch_size_s=60, ) - text = rich_transcription_postprocess(result[0]["text"]) + + # text = rich_transcription_postprocess(result[0]["text"]) + + # Handle language tags + text = custom_lang_filter(result[0]["text"]) logger.bind(tag=TAG).debug( f"语音识别耗时: {time.time() - start_time:.3f}s | 结果: {text}" ) diff --git a/main/xiaozhi-server/core/providers/asr/fun_server.py b/main/xiaozhi-server/core/providers/asr/fun_server.py index 7eb9e7848f..db07fce24a 100644 ---
a/main/xiaozhi-server/core/providers/asr/fun_server.py +++ b/main/xiaozhi-server/core/providers/asr/fun_server.py @@ -1,12 +1,12 @@ from typing import Optional, Tuple, List from core.providers.asr.base import ASRProviderBase +from core.providers.asr.utils import custom_lang_filter from core.providers.asr.dto.dto import InterfaceType import ssl import json import websockets from config.logger import setup_logging import asyncio -import re TAG = __name__ logger = setup_logging() @@ -151,9 +151,13 @@ async def speech_to_text( # Get the result from the receive task result = receive_task.result() - match = re.match(r"<\|(.*?)\|><\|(.*?)\|><\|(.*?)\|>(.*)", result) - if match: - result = match.group(4).strip() + + # match = re.match(r"<\|(.*?)\|><\|(.*?)\|><\|(.*?)\|>(.*)", result) + # if match: + # result = match.group(4).strip() + + # Handle language tags + result = custom_lang_filter(result) return ( result, file_path, diff --git a/main/xiaozhi-server/core/providers/asr/utils.py b/main/xiaozhi-server/core/providers/asr/utils.py new file mode 100644 index 0000000000..1021794ca4 --- /dev/null +++ b/main/xiaozhi-server/core/providers/asr/utils.py @@ -0,0 +1,45 @@ +import re +from config.logger import setup_logging + +TAG = __name__ +logger = setup_logging() + + +def custom_lang_filter(text): + """ + 自定义过滤函数:只保留语言标签,移除其他所有标签 + + 用于FunASR识别结果的处理,保留语言标签(如<|zh|>、<|en|>等), + 但移除其他所有格式的标签(如时间戳、情感标签等) + + Args: + text: ASR识别的原始文本,可能包含多种标签 + + Returns: + str: 处理后的文本,只保留语言标签(如果存在) + + Examples: + >>> custom_lang_filter("<|zh|><|emotion:happy|>你好") + '<|zh|>你好' + >>> custom_lang_filter("<|en|>hello world") + '<|en|>hello world' + >>> custom_lang_filter("<|timestamp:1.5|>测试") + '测试' + """ + # 定义语言标签模式 + lang_pattern = r"<\|(zh|en|yue|ja|ko|nospeech)\|>" + lang_tags = re.findall(lang_pattern, text) + + # 移除所有 < | ... 
| > 格式的标签 + clean_text = re.sub(r"<\|.*?\|>", "", text) + + # 在开头添加语言标签(如果存在) + if lang_tags: + if len(lang_tags) > 1: + logger.bind(tag=TAG).warning( + f"检测到多个语言标签: {lang_tags},仅使用第一个: {lang_tags[0]}" + ) + clean_text = f"<|{lang_tags[0]}|>{clean_text}" + + return clean_text.strip() + diff --git a/main/xiaozhi-server/core/providers/tts/alibl_stream.py b/main/xiaozhi-server/core/providers/tts/alibl_stream.py index c0d187f0bd..252cfda3ca 100644 --- a/main/xiaozhi-server/core/providers/tts/alibl_stream.py +++ b/main/xiaozhi-server/core/providers/tts/alibl_stream.py @@ -39,6 +39,13 @@ def __init__(self, config, delete_audio_file): if config.get("private_voice"): self.voice = config.get("private_voice") + # 多语言音色配置 + self.voice_zh = config.get("voice_zh", self.voice) # 中文音色 + self.voice_yue = config.get("voice_yue", self.voice) # 粤语音色 + self.voice_en = config.get("voice_en", self.voice) # 英语音色 + self.voice_ja = config.get("voice_ja", self.voice) # 日语音色 + self.voice_ko = config.get("voice_ko", self.voice) # 韩语音色 + # 音频参数配置 self.format = config.get("format", "pcm") sample_rate = config.get("sample_rate", "24000") @@ -65,6 +72,38 @@ def __init__(self, config, delete_audio_file): sample_rate=self.sample_rate, channels=1, frame_size_ms=60 ) + def get_voice_by_language(self, language_tag): + """根据语言标签返回对应的音色(仅在FunASR语音识别时生效)""" + if not language_tag: + return self.voice + + # 检查当前ASR是否为FunASR + is_funasr = False + if hasattr(self, 'conn') and self.conn and hasattr(self.conn, 'asr') and self.conn.asr: + asr_module = self.conn.asr.__class__.__module__ + if 'fun_local' in asr_module or 'fun_server' in asr_module: + is_funasr = True + logger.bind(tag=TAG).debug(f"当前使用FunASR语音识别: {asr_module}") + + # 只有在使用FunASR时才应用多语言音色选择 + if is_funasr: + language_tag = language_tag.lower() + voice_map = { + 'zh': self.voice_zh, + 'yue': self.voice_yue, + 'en': self.voice_en, + 'ja': self.voice_ja, + 'ko': self.voice_ko + } + + selected_voice = voice_map.get(language_tag, self.voice) + 
logger.bind(tag=TAG).info(f"FunASR语言标签 '{language_tag}' 选择音色: {selected_voice}") + return selected_voice + else: + # 非FunASR时使用默认音色 + logger.bind(tag=TAG).debug(f"非FunASR语音识别,使用默认音色: {self.voice}") + return self.voice + async def _ensure_connection(self): """确保WebSocket连接可用,支持60秒内连接复用""" try: @@ -228,6 +267,9 @@ async def start_session(self, session_id): # 启动监听任务 self._monitor_task = asyncio.create_task(self._start_monitor_tts_response()) + # 根据当前语言标签选择音色 + current_voice = self.get_voice_by_language(getattr(self.conn, 'current_language_tag', None)) + # 发送run-task消息启动会话 run_task_message = { "header": { @@ -242,7 +284,7 @@ async def start_session(self, session_id): "model": self.model, "parameters": { "text_type": "PlainText", - "voice": self.voice, + "voice": current_voice, "format": self.format, "sample_rate": self.sample_rate, "volume": self.volume, @@ -412,6 +454,13 @@ async def _generate_audio(): ) try: + # 选择音色:优先使用当前连接的语言标签,否则使用默认音色 + # 注意:to_tts可能在独立场景下调用(无active connection) + if hasattr(self, 'conn') and self.conn and hasattr(self.conn, 'current_language_tag'): + current_voice = self.get_voice_by_language(self.conn.current_language_tag) + else: + current_voice = self.voice + # 发送run-task消息启动会话 run_task_message = { "header": { @@ -426,7 +475,7 @@ async def _generate_audio(): "model": self.model, "parameters": { "text_type": "PlainText", - "voice": self.voice, + "voice": current_voice, "format": self.format, "sample_rate": self.sample_rate, "volume": self.volume,