voxcpm2_api_tts.py 26 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624
  1. """
  2. VoxCPM2 API 调用模式 TTS
  3. 通过 HTTP API 调用独立的 VoxCPM2 模型服务,避免在数字人进程中加载模型
  4. """
  5. from __future__ import annotations
  6. import time
  7. import os
  8. import re
  9. import hashlib
  10. import numpy as np
  11. import soundfile as sf
  12. import resampy
  13. import requests
  14. from io import BytesIO
  15. from queue import Queue
  16. from threading import Thread, Event
  17. from enum import Enum
  18. from typing import TYPE_CHECKING
  19. if TYPE_CHECKING:
  20. from basereal import BaseReal
  21. from logger import logger
  22. class State(Enum):
  23. RUNNING = 0
  24. PAUSE = 1
  25. class VoxCPM2APITTS:
  26. """通过 API 调用 VoxCPM2 语音合成服务"""
  27. def __init__(self, opt, parent: BaseReal):
  28. self.opt = opt
  29. self.parent = parent
  30. # API 配置(中性模式:降低 CFG,不使用 prompt)
  31. self.api_url = getattr(opt, 'VOXCPM2_API_URL', 'http://localhost:6003')
  32. self.ref_audio_path = getattr(opt, 'VOXCPM2_REF_WAV', 'voice_output.wav')
  33. self.ref_text = getattr(opt, 'VOXCPM2_REF_TEXT', '你好,买水果,卖水果,新鲜的水果。')
  34. # 中性模式:降低 CFG 值,减少情绪模仿
  35. self.cfg_value = getattr(opt, 'CFG_VALUE', 1.5) # 从 2.0 降低到 1.5
  36. self.inference_timesteps = getattr(opt, 'INFERENCE_TIMESTEPS', 10)
  37. # 音频参数
  38. self.fps = opt.fps
  39. self.sample_rate = 16000
  40. self.chunk = self.sample_rate // self.fps # 320 samples per chunk (20ms)
  41. # 消息队列
  42. self.msgqueue = Queue()
  43. self.high_priority_queue = Queue()
  44. self.image_description_queue = Queue()
  45. self.is_playing_image_description = False
  46. self.state = State.RUNNING
  47. self.interrupted_messages = []
  48. self.current_msg = None
  49. self.current_msg_progress = 0
  50. self.interrupt_flag = Event()
  51. # 预生成音频配置(已禁用,使用实时克隆)
  52. self.pre_gen_dir = "./data/pre_generated_tts"
  53. self.audio_map = {} # 不再加载预生成音频
  54. self.use_pre_gen = False # 禁用预生成音频
  55. # Image_Analysis 音频目录配置
  56. self.image_analysis_dir = "/mnt/nvme1data/Digital_Human/Image_Analysis/wav/wav"
  57. self.played_audio_files = set()
  58. # 实时克隆音频保存目录
  59. self.kelong_dir = "/mnt/nvme1data/Digital_Human/Image_Analysis/knowledge_kelong"
  60. os.makedirs(self.kelong_dir, exist_ok=True)
  61. logger.info(f"📁 实时克隆音频保存目录:{self.kelong_dir}")
  62. # 移除预生成音频循环播放配置
  63. # self.pre_gen_index = 0 # 不再需要
  64. # self.is_playing_pre_gen_loop = False # 不再需要
  65. # 检查 API 服务
  66. self._check_api_service()
  67. logger.info(f"VoxCPM2 API TTS 初始化完成(实时克隆模式):")
  68. logger.info(f" API 地址:{self.api_url}")
  69. logger.info(f" 参考音频:{self.ref_audio_path}")
  70. logger.info(f" 参考文本:{self.ref_text}")
  71. logger.info(f" CFG 值:{self.cfg_value}")
  72. logger.info(f" 推理步数:{self.inference_timesteps}")
  73. logger.info(f" 预生成音频:已禁用(使用实时克隆)")
  74. logger.info(f" 克隆音频保存:{self.kelong_dir}")
  75. logger.info(f" Image_Analysis 目录:{self.image_analysis_dir}")
  76. def _check_api_service(self):
  77. """检查 API 服务是否可用"""
  78. try:
  79. logger.info(f"🔍 检查 VoxCPM2 API 服务:{self.api_url}/health")
  80. resp = requests.get(f"{self.api_url}/health", timeout=5)
  81. if resp.status_code == 200:
  82. health = resp.json()
  83. logger.info(f"✅ VoxCPM2 API 服务正常")
  84. logger.info(f" 模型:{health.get('model', 'Unknown')}")
  85. logger.info(f" 设备:{health.get('device', 'Unknown')}")
  86. logger.info(f" 采样率:{health.get('sample_rate', 'Unknown')} Hz")
  87. else:
  88. raise Exception(f"API 服务异常:HTTP {resp.status_code}")
  89. except requests.exceptions.ConnectionError:
  90. logger.error(f"❌ 无法连接到 VoxCPM2 API 服务:{self.api_url}")
  91. logger.error("请先启动 API 服务:")
  92. logger.error(" cd /mnt/nvme1data/model && python voxcpm2_api.py")
  93. raise
  94. except Exception as e:
  95. logger.error(f"❌ VoxCPM2 API 服务检查失败:{e}")
  96. raise
  97. def reset_interrupt_flag(self):
  98. """重置打断标志"""
  99. self.interrupt_flag.clear()
  100. def set_interrupt_flag(self):
  101. """设置打断标志"""
  102. self.interrupt_flag.set()
  103. def flush_talk(self):
  104. """停止当前播放并清空待处理的消息队列"""
  105. with self.msgqueue.mutex:
  106. remaining_msgs = list(self.msgqueue.queue)
  107. if remaining_msgs:
  108. self.interrupted_messages.extend(remaining_msgs)
  109. self.msgqueue.queue.clear()
  110. if self.current_msg:
  111. self.interrupted_messages.append(self.current_msg)
  112. self.current_msg = None
  113. self.state = State.PAUSE
  114. def resume_interrupted(self):
  115. """恢复播放被中断的消息"""
  116. if self.interrupted_messages:
  117. with self.msgqueue.mutex:
  118. for msg in self.interrupted_messages:
  119. self.msgqueue.put(msg)
  120. self.interrupted_messages.clear()
  121. self.state = State.RUNNING
  122. return True
  123. return False
  124. def _load_audio_map(self) -> dict:
  125. """加载预生成音频映射表"""
  126. import json
  127. import os
  128. map_path = os.path.join(self.pre_gen_dir, "audio_map.json")
  129. if os.path.exists(map_path):
  130. try:
  131. with open(map_path, 'r', encoding='utf-8') as f:
  132. return json.load(f)
  133. except Exception as e:
  134. logger.warning(f"加载预生成音频映射表失败: {e}")
  135. return {}
  136. def _get_pre_generated_audio(self, text: str):
  137. """获取预生成的音频数据,如果不存在返回 None"""
  138. import hashlib
  139. import os
  140. if not self.use_pre_gen:
  141. return None
  142. text_hash = hashlib.md5(text.encode('utf-8')).hexdigest()[:16]
  143. audio_path = os.path.join(self.pre_gen_dir, f"{text_hash}.wav")
  144. if not os.path.exists(audio_path):
  145. return None
  146. try:
  147. stream, sample_rate = sf.read(audio_path, dtype='float32')
  148. if stream.ndim > 1:
  149. stream = stream[:, 0]
  150. if sample_rate != self.sample_rate and stream.shape[0] > 0:
  151. stream = resampy.resample(x=stream, sr_orig=sample_rate, sr_new=self.sample_rate)
  152. return stream
  153. except Exception as e:
  154. logger.warning(f"读取预生成音频失败:{e}")
  155. return None
  156. def _get_newest_image_analysis_audio(self):
  157. """获取最新的未播放的图片分析音频文件"""
  158. import re
  159. import time
  160. if not os.path.exists(self.image_analysis_dir):
  161. return None
  162. try:
  163. audio_files = []
  164. all_files = os.listdir(self.image_analysis_dir)
  165. logger.info(f"🔍 Image_Analysis 目录共有 {len(all_files)} 个文件")
  166. logger.info(f" 已播放文件数:{len(self.played_audio_files)}")
  167. for f in all_files:
  168. if f.endswith('.wav'):
  169. file_path = os.path.join(self.image_analysis_dir, f)
  170. match = re.search(r'hifi_clone_(\d{8}_\d{6})', f)
  171. if match:
  172. time_str = match.group(1)
  173. file_timestamp = time.mktime(time.strptime(time_str, "%Y%m%d_%H%M%S"))
  174. logger.info(f" 📁 {f} - 文件名时间:{time.strftime('%H:%M:%S', time.localtime(file_timestamp))}, 已播放:{f in self.played_audio_files}")
  175. else:
  176. file_timestamp = os.path.getmtime(file_path)
  177. logger.info(f" 📁 {f} - 修改时间:{time.strftime('%H:%M:%S', time.localtime(file_timestamp))}, 已播放:{f in self.played_audio_files}")
  178. is_played = f in self.played_audio_files
  179. if not is_played:
  180. audio_files.append((file_path, file_timestamp, f))
  181. if not audio_files:
  182. logger.info(" ⚠️ 没有未播放的音频文件")
  183. return None
  184. audio_files.sort(key=lambda x: x[1], reverse=True)
  185. newest_file = audio_files[0][2]
  186. logger.info(f" ✅ 发现最新未播放文件:{newest_file}")
  187. return audio_files[0]
  188. except Exception as e:
  189. logger.error(f"获取图片分析音频失败:{e}", exc_info=True)
  190. return None
  191. def _get_next_pre_gen_audio(self):
  192. """获取下一条预生成音频(循环播放)"""
  193. if not self.use_pre_gen or len(self.audio_map) == 0:
  194. return None
  195. try:
  196. audio_files = sorted([
  197. f for f in os.listdir(self.pre_gen_dir)
  198. if f.endswith('.wav')
  199. ])
  200. if not audio_files:
  201. return None
  202. audio_file = audio_files[self.pre_gen_index % len(audio_files)]
  203. self.pre_gen_index += 1
  204. audio_path = os.path.join(self.pre_gen_dir, audio_file)
  205. stream, sample_rate = sf.read(audio_path, dtype='float32')
  206. if stream.ndim > 1:
  207. stream = stream[:, 0]
  208. if sample_rate != self.sample_rate and stream.shape[0] > 0:
  209. stream = resampy.resample(x=stream, sr_orig=sample_rate, sr_new=self.sample_rate)
  210. return stream
  211. except Exception as e:
  212. logger.error(f"获取预生成音频失败:{e}")
  213. return None
  214. def generate_audio_via_api(self, text: str, save_to_kelong: bool = True) -> np.ndarray:
  215. """通过 API 生成音频,返回 16kHz numpy 数组
  216. Args:
  217. text: 要转换的文本
  218. save_to_kelong: 是否保存到 knowledge_kelong 目录
  219. """
  220. try:
  221. logger.info(f"🎤 调用 VoxCPM2 API 生成音频(中性模式): {text[:30]}...")
  222. # 中性模式配置:只使用 reference_wav_path,不使用 prompt
  223. payload = {
  224. "text": text,
  225. "reference_wav_path": self.ref_audio_path, # 音色参考
  226. # 不使用 prompt_wav_path 和 prompt_text(避免情绪模仿)
  227. "cfg_value": self.cfg_value, # 降低 CFG,减少情绪
  228. "inference_timesteps": self.inference_timesteps
  229. }
  230. start_time = time.time()
  231. resp = requests.post(
  232. f"{self.api_url}/v1/tts/generate_audio",
  233. json=payload,
  234. timeout=120
  235. )
  236. if resp.status_code != 200:
  237. raise Exception(f"API 返回错误:HTTP {resp.status_code} - {resp.text}")
  238. api_sample_rate = int(resp.headers.get('X-Sample-Rate', 48000))
  239. generate_time = time.time() - start_time
  240. logger.info(f"✅ API 音频生成完成,耗时:{generate_time:.2f}秒")
  241. logger.info(f" API 返回采样率:{api_sample_rate} Hz")
  242. wav_data = BytesIO(resp.content)
  243. stream, sr = sf.read(wav_data, dtype='float32')
  244. if stream.ndim > 1:
  245. stream = stream[:, 0]
  246. # 保存原始 48kHz 音频到 knowledge_kelong 目录
  247. if save_to_kelong:
  248. try:
  249. timestamp = int(time.time())
  250. # 使用文本前 20 个字符作为文件名(去除特殊字符)
  251. safe_text = re.sub(r'[\\/*?:"<>|]', '', text[:20])
  252. filename = f"{timestamp}_{safe_text}.wav"
  253. kelong_path = os.path.join(self.kelong_dir, filename)
  254. # 保存 48kHz 原始音频(重采样之前)
  255. sf.write(kelong_path, stream, sr, format='WAV')
  256. logger.info(f"💾 已保存克隆音频:{kelong_path}")
  257. logger.info(f" 采样率:{sr} Hz,文件大小:{os.path.getsize(kelong_path)} bytes")
  258. except Exception as e:
  259. logger.error(f"保存克隆音频失败:{e}")
  260. # 重采样到 16kHz(数字人播放需要)
  261. if sr != self.sample_rate:
  262. logger.info(f" 重采样:{sr} Hz → {self.sample_rate} Hz")
  263. resample_start = time.time()
  264. # 使用高质量重采样滤波器,避免滋啦声和高频噪声
  265. stream = resampy.resample(
  266. x=stream,
  267. sr_orig=sr,
  268. sr_new=self.sample_rate,
  269. filter='kaiser_best' # 使用 Kaiser 最佳滤波器
  270. )
  271. logger.info(f" 重采样完成,耗时:{time.time()-resample_start:.2f}秒")
  272. # 添加淡入淡出处理,避免首尾爆音和咔嗒声
  273. fade_duration = 0.015 # 15ms 淡入淡出
  274. fade_samples = int(fade_duration * self.sample_rate)
  275. if len(stream) > 2 * fade_samples:
  276. # 创建淡入淡出曲线(使用余弦曲线更平滑)
  277. fade_in = (1.0 - np.cos(np.linspace(0, np.pi, fade_samples))) / 2.0
  278. fade_out = (1.0 - np.cos(np.linspace(np.pi, 0, fade_samples))) / 2.0
  279. # 应用淡入淡出
  280. stream[:fade_samples] *= fade_in
  281. stream[-fade_samples:] *= fade_out
  282. logger.info(f" 已应用淡入淡出处理({fade_duration*1000:.0f}ms)")
  283. logger.info(f" 音频采样点数:{stream.shape[0]}")
  284. logger.info(f" 音频时长:{stream.shape[0] / self.sample_rate:.2f}s")
  285. return stream
  286. except requests.exceptions.Timeout:
  287. logger.error("❌ API 请求超时(120秒)")
  288. raise Exception("语音生成超时")
  289. except Exception as e:
  290. logger.error(f"❌ API 调用失败:{e}")
  291. raise
  292. def _play_audio_stream(self, stream: np.ndarray, text: str, textevent: dict) -> bool:
  293. """播放音频流,返回是否完整播放"""
  294. streamlen = stream.shape[0]
  295. idx = 0
  296. total_chunks = streamlen // self.chunk
  297. chunk_count = 0
  298. logger.info(f" 📦 音频流总帧数:{streamlen}, chunk 大小:{self.chunk}, 预计发送 {total_chunks} 个 chunk")
  299. logger.info(f" ⏱️ 预计播放时长:{total_chunks * 0.02:.2f}秒")
  300. start_time = time.time()
  301. expected_frame_time = 0.0
  302. frame_duration = 0.02
  303. while streamlen >= self.chunk and self.state == State.RUNNING:
  304. if self.interrupt_flag.is_set() or not self.high_priority_queue.empty():
  305. logger.info(" ⚠️ 播放过程中发现高优先级消息,中断播放")
  306. return False
  307. eventpoint = {}
  308. streamlen -= self.chunk
  309. chunk_count += 1
  310. if idx == 0:
  311. eventpoint = {'status': 'start', 'text': text}
  312. eventpoint.update(**textevent)
  313. logger.info(f" ▶️ 开始播放第 {chunk_count}/{total_chunks} 个 chunk (START)")
  314. elif streamlen < self.chunk:
  315. eventpoint = {'status': 'end', 'text': text}
  316. eventpoint.update(**textevent)
  317. logger.info(f" ◀️ 开始播放第 {chunk_count}/{total_chunks} 个 chunk (END)")
  318. else:
  319. if chunk_count % 10 == 1:
  320. logger.info(f" ➡️ 播放第 {chunk_count}/{total_chunks} 个 chunk")
  321. if self.state != State.RUNNING:
  322. logger.info(f" ⚠️ 状态不是 RUNNING,停止播放")
  323. return False
  324. if self.interrupt_flag.is_set() or not self.high_priority_queue.empty():
  325. logger.info(" ⚠️ 发送音频帧前发现高优先级消息,中断播放")
  326. return False
  327. self.parent.put_audio_frame(stream[idx:idx + self.chunk], eventpoint)
  328. idx += self.chunk
  329. expected_frame_time += frame_duration
  330. elapsed = time.time() - start_time
  331. delay = expected_frame_time - elapsed
  332. if delay > 0.005:
  333. sleep_time = min(delay, 0.04)
  334. time.sleep(sleep_time)
  335. elif delay < -0.1:
  336. logger.warning(f" ⚠️ 播放落后 {abs(delay):.3f}s,跳过一些帧以追赶进度")
  337. actual_duration = time.time() - start_time
  338. logger.info(f" ✅ 所有 {chunk_count} 个 chunk 发送完成,实际耗时:{actual_duration:.2f}s (预期:{chunk_count * 0.02:.2f}s)")
  339. return True
  340. def put_msg_txt(self, msg: str, datainfo: dict = {}):
  341. """将文本消息放入队列"""
  342. if len(msg) > 0:
  343. self.msgqueue.put((msg, datainfo))
  344. def put_high_priority_msg(self, msg: str, datainfo: dict = {}):
  345. """添加高优先级消息"""
  346. if len(msg) > 0:
  347. self.high_priority_queue.put((msg, datainfo))
  348. def put_image_description(self, msg: str, datainfo: dict = {}):
  349. """添加图像描述消息"""
  350. if len(msg) > 0:
  351. logger.info(f"收到图像描述消息,长度:{len(msg)}字")
  352. self.image_description_queue.put((msg, datainfo))
  353. def render(self, quit_event):
  354. """启动 TTS 处理线程"""
  355. process_thread = Thread(target=self.process_tts, args=(quit_event,))
  356. process_thread.start()
  357. def process_tts(self, quit_event):
  358. """处理 TTS 消息队列"""
  359. was_playing_pre_gen = False
  360. while not quit_event.is_set():
  361. try:
  362. msg = None
  363. # 优先级 1: 用户问答
  364. if not self.high_priority_queue.empty():
  365. msg = self.high_priority_queue.get_nowait()
  366. logger.info("★★★★★ 处理用户问答(最高优先级) ★★★★★")
  367. self.state = State.RUNNING
  368. if msg and len(msg) >= 2 and isinstance(msg[0], str) and msg[0].strip():
  369. self.txt_to_audio(msg)
  370. if was_playing_pre_gen:
  371. logger.info("用户问答播放完成,恢复播放预生成音频")
  372. continue
  373. # 优先级 2: Image_Analysis 新音频
  374. check_start_time = time.time()
  375. newest_audio = self._get_newest_image_analysis_audio()
  376. if newest_audio is not None:
  377. file_path, file_timestamp, filename = newest_audio
  378. detect_time = time.time()
  379. logger.info("⏳ 等待文件写入完成...")
  380. wait_start = time.time()
  381. last_size = 0
  382. stable_count = 0
  383. max_wait = 10
  384. while (time.time() - wait_start) < max_wait:
  385. try:
  386. current_size = os.path.getsize(file_path)
  387. if current_size == last_size and current_size > 0:
  388. stable_count += 1
  389. if stable_count >= 3:
  390. wait_end = time.time()
  391. logger.info(f"✅ 文件写入完成,等待耗时:{(wait_end - wait_start)*1000:.0f}ms,文件大小:{current_size} bytes")
  392. break
  393. else:
  394. stable_count = 0
  395. last_size = current_size
  396. time.sleep(0.15)
  397. except:
  398. time.sleep(0.15)
  399. time_since_creation = detect_time - file_timestamp
  400. logger.info("★★★ 发现新的图片分析音频,立即播放 ★★★")
  401. logger.info(f" 文件名:{filename}")
  402. logger.info(f" ⏱️ 文件生成到检测耗时:{time_since_creation:.3f}秒")
  403. was_playing_pre_gen = True
  404. play_start_time = time.time()
  405. self._play_image_analysis_audio()
  406. play_end_time = time.time()
  407. logger.info("=" * 80)
  408. logger.info("⏱️⏱️⏱️ 【图片分析音频时间统计】 ⏱️⏱️⏱️")
  409. logger.info(f" 📊 从文件生成到开始播放:{play_start_time - file_timestamp:.3f}秒")
  410. logger.info(f" 📊 实际播放时长:{play_end_time - play_start_time:.3f}秒")
  411. logger.info("=" * 80)
  412. logger.info("图片描述播放完成,将继续播放预生成音频")
  413. continue
  414. # 优先级 3: 循环播放预生成音频
  415. pre_gen_stream = self._get_next_pre_gen_audio()
  416. if pre_gen_stream is not None:
  417. logger.info(f"★ 循环播放预生成音频 #{self.pre_gen_index}")
  418. eventpoint = {'status': 'start', 'text': '预生成介绍'}
  419. was_playing_pre_gen = True
  420. self._play_audio_stream(pre_gen_stream, "[预生成介绍]", eventpoint)
  421. continue
  422. was_playing_pre_gen = False
  423. try:
  424. msg = self.msgqueue.get(block=True, timeout=0.5)
  425. except queue.Empty:
  426. continue
  427. if msg and len(msg) >= 2 and isinstance(msg[0], str) and not msg[0].strip():
  428. continue
  429. self.current_msg = msg
  430. self.current_msg_progress = 0
  431. if self.state == State.RUNNING:
  432. self.txt_to_audio(msg)
  433. self.current_msg = None
  434. self.current_msg_progress = 0
  435. except Exception as e:
  436. logger.error(f"process_tts 错误:{e}")
  437. continue
  438. logger.info('voxcpm2_api_tts thread stop')
  439. def txt_to_audio(self, msg: tuple[str, dict]):
  440. """将文本转换为音频(通过 API)"""
  441. text, textevent = msg
  442. t = time.time()
  443. if self.state != State.RUNNING:
  444. return
  445. if not text.strip():
  446. return
  447. self.reset_interrupt_flag()
  448. if not self.high_priority_queue.empty():
  449. logger.info("发现高优先级消息,跳过当前普通消息处理")
  450. return
  451. try:
  452. # 首先尝试使用预生成音频
  453. pre_gen_stream = self._get_pre_generated_audio(text)
  454. if pre_gen_stream is not None:
  455. logger.info(f"使用预生成音频: {text[:30]}...")
  456. if self.state != State.RUNNING:
  457. return
  458. if self.interrupt_flag.is_set() or not self.high_priority_queue.empty():
  459. return
  460. self._play_audio_stream(pre_gen_stream, text, textevent)
  461. logger.info(f'-------预生成音频播放完成,耗时:{time.time()-t:.4f}s')
  462. return
  463. # 调用 API 生成音频
  464. logger.info(f"调用 API 生成音频: {text[:30]}...")
  465. stream = self.generate_audio_via_api(text)
  466. if self.state != State.RUNNING:
  467. return
  468. if self.interrupt_flag.is_set() or not self.high_priority_queue.empty():
  469. return
  470. self._play_audio_stream(stream, text, textevent)
  471. except Exception as e:
  472. logger.error(f"VoxCPM2 API TTS 失败:{e}")
  473. def _play_image_analysis_audio(self):
  474. """播放图片分析音频"""
  475. result = self._get_newest_image_analysis_audio()
  476. if result is None:
  477. return False
  478. file_path, file_timestamp, filename = result
  479. try:
  480. logger.info(f"🎵 播放图片分析音频:{filename}")
  481. logger.info(f" 音频文件路径:{file_path}")
  482. stream, sample_rate = sf.read(file_path, dtype='float32')
  483. logger.info(f" 音频原始信息:sample_rate={sample_rate}, shape={stream.shape}")
  484. if stream.ndim > 1:
  485. stream = stream[:, 0]
  486. if sample_rate != self.sample_rate and stream.shape[0] > 0:
  487. stream = resampy.resample(x=stream, sr_orig=sample_rate, sr_new=self.sample_rate)
  488. logger.info(f" 处理后音频信息:shape={stream.shape}, chunks={stream.shape[0]//self.chunk}")
  489. eventpoint = {'status': 'start', 'text': '图片描述'}
  490. completed = self._play_audio_stream(stream, f"[图片描述] {filename}", eventpoint)
  491. self.played_audio_files.add(filename)
  492. if completed:
  493. logger.info(f"✅ 图片分析音频播放完成:{filename}")
  494. else:
  495. logger.info(f"⚠️ 图片分析音频被中断:{filename}")
  496. return completed
  497. except Exception as e:
  498. logger.error(f"播放图片分析音频失败:{e}", exc_info=True)
  499. return False