Compare commits

...

13 Commits

Author SHA1 Message Date
朱潮
df9616b48a config 2025-09-20 12:53:58 +08:00
朱潮
072bb0e6b0 Fully fix the echo problem: device-separated playback
- Close the audio input stream completely during playback
- Use the system player (aplay) to avoid device conflicts
- Add a safe playback path, play_audio_safe
- Add delays before and after playback so the device can switch state
- Eliminates hardware crosstalk and acoustic echo
2025-09-20 12:20:30 +08:00
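The device-separation approach in this commit boils down to: tear down the input stream, hand the file to an external player, then rebuild the stream. A compact sketch of that pattern (play_detached is an illustrative name; the full version is play_audio_safe in recorder.py below, and aplay is assumed to be on PATH):

import subprocess
import pyaudio

def play_detached(pa: pyaudio.PyAudio, stream, filename: str):
    """Close the mic, play via the system player, return a fresh input stream."""
    stream.stop_stream()
    stream.close()                                   # input device fully released
    subprocess.run(["aplay", filename], check=True)  # blocking playback
    return pa.open(format=pyaudio.paInt16, channels=1, rate=8000,
                   input=True, frames_per_buffer=1024)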
朱潮
e6341b8620 Fully fix echo being recorded during playback
- Stop the current recording as soon as playback starts
- Clear the pre-record buffer and recorded data
- Skip audio processing entirely during playback
- Reset state and resume listening after playback ends
- Add clear status messages
2025-09-20 12:15:04 +08:00
朱潮
48b99384b7 Fix the voice-detection algorithm: background noise and energy calculation issues
- Fix energy-history updates: update background noise only while not recording
- Raise the ZCR thresholds: a 1000-4000 range suits speech detection better
- Tune the dynamic threshold: background noise +50% for better sensitivity
- Add peak-energy calculation and debug output
- Stop speech from skewing the background-noise estimate
2025-09-20 12:08:40 +08:00
朱潮
c01e6ad1f6 Add playback-state detection to avoid recording echo
- Pause recording processing during playback
- Show a playback status message
- Prevent played audio from being re-recorded
- Avoids the echo problem
2025-09-20 12:00:01 +08:00
朱潮
918bfb24af Set the silence-detection window to 3 seconds 2025-09-20 11:51:02 +08:00
朱潮
939a1721d6 Add pre-recording: fix loss of the start of recordings
- Implement a 2-second pre-record ring buffer
- Automatically include the preceding 2 seconds when sound is detected
- Show buffer usage in real time
- Fully fixes the clipped-start problem
- Show the pre-record duration
2025-09-20 11:44:34 +08:00
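The ring buffer described here can be sketched in a few lines with collections.deque (names are illustrative; the real version is update_pre_record_buffer/start_recording in recorder.py below): keep the newest ~2 s of chunks while idle, and splice them in front of the recording the moment voice is detected.

from collections import deque

RATE, CHUNK, PRE_SECONDS = 8000, 1024, 2.0
pre_buffer = deque(maxlen=int(PRE_SECONDS * RATE / CHUNK))  # ~2 s of chunks
recorded, recording = [], False

def on_chunk(chunk: bytes, voice_detected: bool):
    global recording
    if recording:
        recorded.append(chunk)
    elif voice_detected:
        recorded.extend(pre_buffer)    # include the 2 s before the trigger
        recorded.append(chunk)
        recording = True
    else:
        pre_buffer.append(chunk)       # rolling pre-record context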
朱潮
12c79a5a53 Upgrade sound detection: dynamic threshold + zero-crossing rate
- Implement dynamic threshold adjustment (background noise +25%)
- Add zero-crossing-rate detection to separate speech from noise
- Tune sensitivity for the 50-70 energy range
- Show ZCR and background energy in real time
- Greatly improve speech-detection accuracy
2025-09-20 11:39:56 +08:00
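Both detectors named in this commit are a few lines of numpy per chunk. A minimal sketch, assuming 16-bit mono PCM chunks (function names are illustrative, and the exact constants were retuned in later commits):

import numpy as np

def rms_energy(chunk: bytes) -> float:
    samples = np.frombuffer(chunk, dtype=np.int16).astype(np.float64)
    return float(np.sqrt(np.mean(samples ** 2)))          # RMS energy

def zero_crossing_rate(chunk: bytes, rate: int = 8000) -> float:
    samples = np.frombuffer(chunk, dtype=np.int16)
    crossings = np.sum(np.diff(np.sign(samples)) != 0)    # sign changes
    return crossings / len(samples) * rate                # crossings per second

def dynamic_threshold(background_rms_history) -> float:
    return float(np.mean(background_rms_history)) * 1.25  # background + 25%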
朱潮
b526328fe6 Lower the energy threshold to 200 for higher sensitivity 2025-09-20 11:38:19 +08:00
朱潮
2612ef5b46 Fix a Python indentation error 2025-09-20 11:25:32 +08:00
朱潮
b87be1494d Minimal energy-detection recording system: eliminate the Raspberry Pi 3B latency problem
- Remove the Vosk recognition dependency entirely; use energy detection instead
- Decide sound start/end from the RMS energy value
- Auto-adjust the energy threshold to ambient noise
- Real-time performance monitoring, very low CPU usage
- Expected latency: <0.1 s (was 10 s)
- Supports auto-playing the recorded audio

Optimizations:
- 8 kHz sample rate, 1024-sample chunks
- Automatic threshold-adjustment algorithm
- Silence for 1.5 s ends the recording
- Minimum recording 2 s, maximum 30 s

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-09-20 11:19:08 +08:00
朱潮
70c42eca15 Aggressive performance tuning: sharply reduce Raspberry Pi 3B latency
- Audio parameters: 8 kHz sample rate, 4096-sample chunks (4x)
- Aggressive mode: process directly, skip some recognition results
- Buffer tuning: 5-chunk buffer, 0.2 s processing interval
- Disable word-level recognition to speed up Vosk
- Real-time latency monitoring: show audio-processing delay
- Expected result: latency drops from 10 s to under 1 s

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-09-20 11:07:54 +08:00
朱潮
eb099d827d config 2025-09-20 10:53:56 +08:00
9 changed files with 1780 additions and 748 deletions

BIN
.DS_Store vendored

Binary file not shown.

127
recognition_example.py Normal file

@@ -0,0 +1,127 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
语音识别使用示例
演示如何使用 speech_recognizer 模块
"""
import os
import asyncio
from speech_recognizer import SpeechRecognizer
async def example_recognize_file():
"""示例:识别单个音频文件"""
print("=== 示例1识别单个音频文件 ===")
# 初始化识别器
recognizer = SpeechRecognizer(
app_key="your_app_key", # 请替换为实际的app_key
access_key="your_access_key" # 请替换为实际的access_key
)
# 假设有一个录音文件
audio_file = "recording_20240101_120000.wav"
if not os.path.exists(audio_file):
print(f"音频文件不存在: {audio_file}")
print("请先运行 enhanced_wake_and_record.py 录制一个音频文件")
return
try:
# 识别音频文件
results = await recognizer.recognize_file(audio_file)
print(f"识别结果(共{len(results)}个):")
for i, result in enumerate(results):
print(f"结果 {i+1}:")
print(f" 文本: {result.text}")
print(f" 置信度: {result.confidence}")
print(f" 最终结果: {result.is_final}")
print("-" * 40)
except Exception as e:
print(f"识别失败: {e}")
async def example_recognize_latest():
"""示例:识别最新的录音文件"""
print("\n=== 示例2识别最新的录音文件 ===")
# 初始化识别器
recognizer = SpeechRecognizer(
app_key="your_app_key", # 请替换为实际的app_key
access_key="your_access_key" # 请替换为实际的access_key
)
try:
# 识别最新的录音文件
result = await recognizer.recognize_latest_recording()
if result:
print("识别结果:")
print(f" 文本: {result.text}")
print(f" 置信度: {result.confidence}")
print(f" 最终结果: {result.is_final}")
else:
print("未找到录音文件或识别失败")
except Exception as e:
print(f"识别失败: {e}")
async def example_batch_recognition():
"""示例:批量识别多个录音文件"""
print("\n=== 示例3批量识别录音文件 ===")
# 初始化识别器
recognizer = SpeechRecognizer(
app_key="your_app_key", # 请替换为实际的app_key
access_key="your_access_key" # 请替换为实际的access_key
)
# 获取所有录音文件
recording_files = [f for f in os.listdir(".") if f.startswith('recording_') and f.endswith('.wav')]
if not recording_files:
print("未找到录音文件")
return
print(f"找到 {len(recording_files)} 个录音文件")
for filename in recording_files[:5]: # 只处理前5个文件
print(f"\n处理文件: {filename}")
try:
results = await recognizer.recognize_file(filename)
if results:
final_result = results[-1] # 取最后一个结果
print(f"识别结果: {final_result.text}")
else:
print("识别失败")
except Exception as e:
print(f"处理失败: {e}")
# 添加延迟,避免请求过于频繁
await asyncio.sleep(1)
async def main():
"""主函数"""
print("🚀 语音识别使用示例")
print("=" * 50)
# 请先设置环境变量或在代码中填入实际的API密钥
if not os.getenv("SAUC_APP_KEY") and "your_app_key" in "your_app_key":
print("⚠️ 请先设置 SAUC_APP_KEY 和 SAUC_ACCESS_KEY 环境变量")
print("或者在代码中填入实际的 app_key 和 access_key")
print("示例:")
print("export SAUC_APP_KEY='your_app_key'")
print("export SAUC_ACCESS_KEY='your_access_key'")
return
# 运行示例
await example_recognize_file()
await example_recognize_latest()
await example_batch_recognition()
if __name__ == "__main__":
asyncio.run(main())

580
recorder.py Normal file

@@ -0,0 +1,580 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
基于能量检测的极简录音系统
专门针对树莓派3B优化,完全移除Vosk识别依赖
"""
import sys
import os
import time
import threading
import pyaudio
import numpy as np
import wave
class EnergyBasedRecorder:
"""基于能量检测的录音系统"""
def __init__(self, energy_threshold=500, silence_threshold=1.5, min_recording_time=2.0, max_recording_time=30.0):
# 音频参数 - 极简优化
self.FORMAT = pyaudio.paInt16
self.CHANNELS = 1
self.RATE = 8000 # 8kHz采样率
self.CHUNK_SIZE = 1024 # 适中块大小
# 能量检测参数
self.energy_threshold = energy_threshold # 能量阈值,高于此值认为有声音
self.silence_threshold = silence_threshold # 静音阈值,低于此值持续多久认为结束
self.min_recording_time = min_recording_time # 最小录音时间
self.max_recording_time = max_recording_time # 最大录音时间
self.pre_record_duration = 2.0 # 预录音时长(秒)
# 状态变量
self.audio = None
self.stream = None
self.running = False
self.recording = False
self.recorded_frames = []
self.recording_start_time = None
self.last_sound_time = None
self.energy_history = []
self.zcr_history = [] # ZCR历史
self.max_energy_history = 50 # 最大能量历史记录
# 预录音缓冲区
self.pre_record_buffer = [] # 预录音缓冲区
self.pre_record_max_frames = int(self.pre_record_duration * self.RATE / self.CHUNK_SIZE) # 最大预录音帧数
# 播放状态
self.is_playing = False # 是否正在播放
# 智能静音检测
self.voice_activity_history = [] # 语音活动历史
self.max_voice_history = 20 # 最大语音活动历史记录
self.consecutive_silence_count = 0 # 连续静音计数
self.silence_threshold_count = 15 # 连续静音次数阈值(约1.5秒)
# 智能ZCR静音检测
self.max_zcr_history = 30 # 最大ZCR历史记录
self.consecutive_low_zcr_count = 0 # 连续低ZCR计数
self.low_zcr_threshold_count = 20 # 连续低ZCR次数阈值(约2秒)
# 性能监控
self.frame_count = 0
self.start_time = time.time()
self._setup_audio()
def _setup_audio(self):
"""设置音频设备"""
try:
self.audio = pyaudio.PyAudio()
self.stream = self.audio.open(
format=self.FORMAT,
channels=self.CHANNELS,
rate=self.RATE,
input=True,
frames_per_buffer=self.CHUNK_SIZE
)
print("✅ 音频设备初始化成功")
except Exception as e:
print(f"❌ 音频设备初始化失败: {e}")
def calculate_energy(self, audio_data):
"""计算音频能量"""
if len(audio_data) == 0:
return 0
# 将字节数据转换为numpy数组
audio_array = np.frombuffer(audio_data, dtype=np.int16)
# 计算RMS能量
rms = np.sqrt(np.mean(audio_array.astype(np.float64) ** 2))  # 先转浮点,避免int16平方溢出
# 更新能量历史(只在非录音状态下更新,避免语音影响背景噪音计算)
if not self.recording:
self.energy_history.append(rms)
if len(self.energy_history) > self.max_energy_history:
self.energy_history.pop(0)
return rms
def calculate_peak_energy(self, audio_data):
"""计算峰值能量(辅助判断)"""
if len(audio_data) == 0:
return 0
audio_array = np.frombuffer(audio_data, dtype=np.int16)
peak_energy = np.max(np.abs(audio_array))
return peak_energy
def calculate_zero_crossing_rate(self, audio_data):
"""计算零交叉率(主要语音检测方法)"""
if len(audio_data) == 0:
return 0
audio_array = np.frombuffer(audio_data, dtype=np.int16)
# 计算零交叉次数
zero_crossings = np.sum(np.diff(np.sign(audio_array)) != 0)
# 归一化到采样率
zcr = zero_crossings / len(audio_array) * self.RATE
# 更新ZCR历史
self.zcr_history.append(zcr)
if len(self.zcr_history) > self.max_zcr_history:
self.zcr_history.pop(0)
return zcr
def is_voice_active_advanced(self, energy, zcr):
"""仅使用ZCR进行语音活动检测"""
# ZCR语音检测提高到1200-6000 Hz之间更好地区分语音和环境噪音
# 说话时ZCR会比较稳定在这个范围内
zcr_condition = 1200 < zcr < 6000
# 添加一些容错避免短暂的ZCR波动导致误判
return zcr_condition
def is_voice_active(self, energy):
"""已弃用 - 仅用于兼容性"""
# 现在主要使用ZCR检测这个方法保留但不再使用
return False
def save_recording(self, audio_data, filename=None):
"""保存录音"""
if filename is None:
timestamp = time.strftime("%Y%m%d_%H%M%S")
filename = f"recording_{timestamp}.wav"
try:
with wave.open(filename, 'wb') as wf:
wf.setnchannels(self.CHANNELS)
wf.setsampwidth(self.audio.get_sample_size(self.FORMAT))
wf.setframerate(self.RATE)
wf.writeframes(audio_data)
print(f"✅ 录音已保存: {filename}")
return True, filename
except Exception as e:
print(f"❌ 保存录音失败: {e}")
return False, None
def play_audio(self, filename):
"""播放音频文件"""
try:
print("🔇 准备播放,完全停止音频输入")
# 立即停止当前录音并清空所有缓冲区
if self.recording:
self.recording = False
self.recorded_frames = []
self.recording_start_time = None
self.last_sound_time = None
# 清空所有缓冲区
self.pre_record_buffer = []
self.energy_history = []
self.zcr_history = []
# 完全关闭输入流
if self.stream:
self.stream.stop_stream()
self.stream.close()
self.stream = None
# 设置播放状态
self.is_playing = True
# 等待一小段时间确保音频设备完全停止输入
time.sleep(0.5)
with wave.open(filename, 'rb') as wf:
channels = wf.getnchannels()
width = wf.getsampwidth()
rate = wf.getframerate()
total_frames = wf.getnframes()
# 分块读取音频数据
chunk_size = 1024
frames = []
for _ in range(0, total_frames, chunk_size):
chunk = wf.readframes(chunk_size)
if chunk:
frames.append(chunk)
else:
break
# 创建播放流
playback_stream = self.audio.open(
format=self.audio.get_format_from_width(width),
channels=channels,
rate=rate,
output=True
)
print(f"🔊 开始播放: {filename}")
print("🚫 音频输入已完全关闭")
# 分块播放音频
for chunk in frames:
playback_stream.write(chunk)
playback_stream.stop_stream()
playback_stream.close()
print("✅ 播放完成")
print("🔄 重新开启音频输入")
except Exception as e:
print(f"❌ 播放失败: {e}")
self.play_with_system_player(filename)
finally:
# 恢复播放状态
self.is_playing = False
# 等待播放完全结束
time.sleep(0.3)
# 重新开启输入流
self._setup_audio()
# 重置所有状态
self.energy_history = []
self.zcr_history = []
print("📡 音频输入已重新开启")
def play_with_system_player(self, filename):
"""使用系统播放器播放音频"""
try:
import subprocess
cmd = ['aplay', filename] # Linux系统
print(f"🔊 使用系统播放器: {' '.join(cmd)}")
print("🚫 系统播放器播放中,音频输入保持关闭")
subprocess.run(cmd, check=True)
print("✅ 播放完成")
print("📡 音频输入已保持关闭状态")
except Exception as e:
print(f"❌ 系统播放器也失败: {e}")
def play_audio_safe(self, filename):
"""安全的播放方式 - 使用系统播放器"""
try:
print("🔇 准备播放,完全停止音频输入")
# 立即停止当前录音并清空所有缓冲区
if self.recording:
self.recording = False
self.recorded_frames = []
self.recording_start_time = None
self.last_sound_time = None
# 清空所有缓冲区
self.pre_record_buffer = []
self.energy_history = []
self.zcr_history = []
# 完全关闭输入流
if self.stream:
self.stream.stop_stream()
self.stream.close()
self.stream = None
# 设置播放状态
self.is_playing = True
# 等待确保音频设备完全停止
time.sleep(0.5)
print(f"🔊 开始播放: {filename}")
print("🚫 使用系统播放器,音频输入已完全关闭")
# 使用系统播放器
self.play_with_system_player(filename)
print("🔄 准备重新开启音频输入")
except Exception as e:
print(f"❌ 播放失败: {e}")
finally:
# 恢复播放状态
self.is_playing = False
# 等待播放完全结束
time.sleep(0.5)
# 重新开启输入流
self._setup_audio()
# 重置所有状态
self.energy_history = []
self.zcr_history = []
print("📡 音频输入已重新开启")
def update_pre_record_buffer(self, audio_data):
"""更新预录音缓冲区"""
self.pre_record_buffer.append(audio_data)
# 保持缓冲区大小
if len(self.pre_record_buffer) > self.pre_record_max_frames:
self.pre_record_buffer.pop(0)
def start_recording(self):
"""开始录音"""
print("🎙️ 检测到声音,开始录音...")
self.recording = True
self.recorded_frames = []
# 将预录音缓冲区的内容添加到录音中
self.recorded_frames.extend(self.pre_record_buffer)
# 清空预录音缓冲区
self.pre_record_buffer = []
self.recording_start_time = time.time()
self.last_sound_time = time.time()
self.energy_history = []
self.zcr_history = [] # 重置ZCR历史
# 重置ZCR相关计数器
self.consecutive_low_zcr_count = 0
self.consecutive_silence_count = 0
self.voice_activity_history = []
def stop_recording(self):
"""停止录音"""
if len(self.recorded_frames) > 0:
audio_data = b''.join(self.recorded_frames)
duration = len(audio_data) / (self.RATE * 2) # 16位音频,每样本2字节
# 计算实际录音时长和预录音时长
actual_duration = duration
pre_record_duration = min(duration, self.pre_record_duration)
print(f"📝 录音完成,时长: {actual_duration:.2f}秒 (包含预录音 {pre_record_duration:.1f}秒)")
# 保存录音
success, filename = self.save_recording(audio_data)
# 如果保存成功,播放录音
if success and filename:
print("=" * 50)
print("🔊 播放刚才录制的音频...")
# 优先使用系统播放器避免回声
self.play_audio_safe(filename)
print("=" * 50)
self.recording = False
self.recorded_frames = []
self.recording_start_time = None
self.last_sound_time = None
self.energy_history = []
self.zcr_history = []
def monitor_performance(self):
"""性能监控"""
self.frame_count += 1
if self.frame_count % 1000 == 0: # 每1000帧显示一次
elapsed = time.time() - self.start_time
fps = self.frame_count / elapsed
avg_energy = self.get_average_energy()
print(f"📊 性能: {fps:.1f} FPS | 平均能量: {avg_energy:.1f} | 阈值: {self.energy_threshold}")
def auto_adjust_threshold(self):
"""自动调整能量阈值"""
if len(self.energy_history) >= 20:
# 基于历史能量的中位数和标准差调整阈值
median_energy = np.median(self.energy_history)
std_energy = np.std(self.energy_history)
# 设置阈值为中位数 + 2倍标准差
new_threshold = max(300, median_energy + 2 * std_energy)
# 平滑调整阈值
self.energy_threshold = 0.9 * self.energy_threshold + 0.1 * new_threshold
def run(self):
"""运行录音系统"""
if not self.stream:
print("❌ 音频设备未初始化")
return
self.running = True
print("🎤 开始监听...")
print(f"能量阈值: {self.energy_threshold} (已弃用)")
print(f"静音阈值: {self.silence_threshold}")
print("📖 使用说明:")
print("- 检测到声音自动开始录音")
print("- 持续静音3秒自动结束录音")
print("- 最少录音2秒最多30秒")
print("- 录音完成后自动播放")
print("- 按 Ctrl+C 退出")
print("🎯 新增功能:")
print("- 纯ZCR语音检测移除能量检测")
print("- 零交叉率检测(区分语音和噪音)")
print("- 实时显示ZCR状态")
print("- 预录音功能包含声音开始前2秒")
print("- 环形缓冲区防止丢失开头音频")
print("🤖 纯ZCR静音检测:")
print("- 连续低ZCR计数20次=2秒")
print("- ZCR活动历史追踪")
print("- 基于ZCR模式的静音验证")
print("- 语音范围: 1200-6000 Hz (提高阈值)")
print("=" * 50)
try:
while self.running:
# 读取音频数据
data = self.stream.read(self.CHUNK_SIZE, exception_on_overflow=False)
if len(data) == 0:
continue
# 如果正在播放,完全跳过音频处理
if self.is_playing:
# 显示播放状态
status = "🔊 播放中... 跳过录音处理"
print(f"\r{status}", end='', flush=True)
time.sleep(0.1) # 播放时增加延迟减少CPU使用
continue
# 计算能量和零交叉率
energy = self.calculate_energy(data)
zcr = self.calculate_zero_crossing_rate(data)
peak_energy = self.calculate_peak_energy(data)
# 性能监控
self.monitor_performance()
if self.recording:
# 录音模式
self.recorded_frames.append(data)
recording_duration = time.time() - self.recording_start_time
# 基于ZCR的智能静音检测
if self.is_voice_active_advanced(energy, zcr):
self.last_sound_time = time.time()
self.consecutive_low_zcr_count = 0 # 重置低ZCR计数
self.consecutive_silence_count = 0 # 重置静音计数
else:
self.consecutive_low_zcr_count += 1 # 增加低ZCR计数
self.consecutive_silence_count += 1 # 增加静音计数
# 更新ZCR活动历史基于ZCR是否在语音范围内
self.voice_activity_history.append(1200 < zcr < 6000)
if len(self.voice_activity_history) > self.max_voice_history:
self.voice_activity_history.pop(0)
# 检查是否应该结束录音
current_time = time.time()
# 纯ZCR静音检测
should_stop = False
stop_reason = ""
# 主要检测连续低ZCR计数
if self.consecutive_low_zcr_count >= self.low_zcr_threshold_count:
# 进一步验证检查最近的ZCR活动历史
if len(self.voice_activity_history) >= 15:
recent_voice_activity = sum(self.voice_activity_history[-15:])
if recent_voice_activity <= 3: # 最近15个样本中最多3个有语音活动
should_stop = True
stop_reason = f"ZCR静音检测 ({self.consecutive_low_zcr_count}次连续低ZCR)"
else:
# 如果历史数据不足,使用基础检测
should_stop = True
stop_reason = f"基础ZCR静音检测 ({self.consecutive_low_zcr_count}次)"
# 备用检测:基于时间的静音检测
if not should_stop and current_time - self.last_sound_time > self.silence_threshold:
should_stop = True
stop_reason = f"时间静音检测 ({self.silence_threshold}秒)"
# 执行停止录音
if should_stop and recording_duration >= self.min_recording_time:
print(f"\n🔇 {stop_reason},结束录音")
self.stop_recording()
# 检查最大录音时间
if recording_duration > self.max_recording_time:
print(f"\n⏰ 达到最大录音时间 {self.max_recording_time}")
self.stop_recording()
# 显示录音状态仅ZCR相关信息
is_voice = self.is_voice_active_advanced(energy, zcr)
zcr_progress = f"{self.consecutive_low_zcr_count}/{self.low_zcr_threshold_count}"
recent_activity = sum(self.voice_activity_history[-5:]) if len(self.voice_activity_history) >= 5 else 0
status = f"录音中... {recording_duration:.1f}s | ZCR: {zcr:.0f} | 语音: {is_voice} | 低ZCR计数: {zcr_progress} | 活动: {recent_activity}"
print(f"\r{status}", end='', flush=True)
else:
# 监听模式 - 更新预录音缓冲区
self.update_pre_record_buffer(data)
# 使用高级检测
if self.is_voice_active_advanced(energy, zcr):
# 检测到声音,开始录音
self.start_recording()
else:
# 显示监听状态仅ZCR相关信息
is_voice = self.is_voice_active_advanced(energy, zcr)
buffer_usage = len(self.pre_record_buffer) / self.pre_record_max_frames * 100
status = f"监听中... ZCR: {zcr:.0f} | 语音: {is_voice} | 缓冲: {buffer_usage:.0f}%"
print(f"\r{status}", end='', flush=True)
# 减少CPU使用
time.sleep(0.01)
except KeyboardInterrupt:
print("\n👋 退出")
except Exception as e:
print(f"❌ 错误: {e}")
finally:
self.stop()
def stop(self):
"""停止系统"""
self.running = False
if self.recording:
self.stop_recording()
if self.stream:
self.stream.stop_stream()
self.stream.close()
if self.audio:
self.audio.terminate()
def main():
"""主函数"""
print("🚀 基于能量检测的极简录音系统")
print("=" * 50)
# 创建录音系统
recorder = EnergyBasedRecorder(
energy_threshold=200, # 能量阈值(降低以提高灵敏度)
silence_threshold=3.0, # 静音阈值(秒)- 改为3秒
min_recording_time=2.0, # 最小录音时间
max_recording_time=30.0 # 最大录音时间
)
print("✅ 系统初始化成功")
print("🎯 优化特点:")
print(" - 完全移除Vosk识别依赖")
print(" - 基于能量检测极低CPU占用")
print(" - 自动调整能量阈值")
print(" - 实时性能监控")
print(" - 预期延迟: <0.1秒")
print("=" * 50)
# 开始运行
recorder.run()
if __name__ == "__main__":
main()

requirements.txt

@@ -1,3 +1,5 @@
vosk>=0.3.44
pyaudio>=0.2.11
numpy>=1.19.0
numpy>=1.19.0
aiohttp>=3.8.0
asyncio

15
sauc_python/readme.md Normal file
View File

@@ -0,0 +1,15 @@
# README
**ASR ToB client demo**
# Notice
python version: python 3.x
Replace the keys in the code with real values:
"app_key": "xxxxxxx",
"access_key": "xxxxxxxxxxxxxxxx"
Usage example:
python3 sauc_websocket_demo.py --file /Users/bytedance/code/python/eng_ddc_itn.wav
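For reference, the speech_recognizer.py module added in this compare reads the same credentials from the SAUC_APP_KEY / SAUC_ACCESS_KEY environment variables, so a sketch of a lookup that supports both styles (environment first, in-code placeholder as fallback) could be:

import os

auth = {
    "app_key": os.getenv("SAUC_APP_KEY", "xxxxxxx"),
    "access_key": os.getenv("SAUC_ACCESS_KEY", "xxxxxxxxxxxxxxxx"),
}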

sauc_python/sauc_websocket_demo.py Normal file

@@ -0,0 +1,523 @@
import asyncio
import aiohttp
import json
import struct
import gzip
import uuid
import logging
import os
import subprocess
from typing import Optional, List, Dict, Any, Tuple, AsyncGenerator
# 配置日志
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s',
handlers=[
logging.FileHandler('run.log'),
logging.StreamHandler()
]
)
logger = logging.getLogger(__name__)
# 常量定义
DEFAULT_SAMPLE_RATE = 16000
class ProtocolVersion:
V1 = 0b0001
class MessageType:
CLIENT_FULL_REQUEST = 0b0001
CLIENT_AUDIO_ONLY_REQUEST = 0b0010
SERVER_FULL_RESPONSE = 0b1001
SERVER_ERROR_RESPONSE = 0b1111
class MessageTypeSpecificFlags:
NO_SEQUENCE = 0b0000
POS_SEQUENCE = 0b0001
NEG_SEQUENCE = 0b0010
NEG_WITH_SEQUENCE = 0b0011
class SerializationType:
NO_SERIALIZATION = 0b0000
JSON = 0b0001
class CompressionType:
GZIP = 0b0001
class Config:
def __init__(self):
# 填入控制台获取的app id和access token
self.auth = {
"app_key": "xxxxxxx",
"access_key": "xxxxxxxxxxxx"
}
@property
def app_key(self) -> str:
return self.auth["app_key"]
@property
def access_key(self) -> str:
return self.auth["access_key"]
config = Config()
class CommonUtils:
@staticmethod
def gzip_compress(data: bytes) -> bytes:
return gzip.compress(data)
@staticmethod
def gzip_decompress(data: bytes) -> bytes:
return gzip.decompress(data)
@staticmethod
def judge_wav(data: bytes) -> bool:
if len(data) < 44:
return False
return data[:4] == b'RIFF' and data[8:12] == b'WAVE'
@staticmethod
def convert_wav_with_path(audio_path: str, sample_rate: int = DEFAULT_SAMPLE_RATE) -> bytes:
try:
cmd = [
"ffmpeg", "-v", "quiet", "-y", "-i", audio_path,
"-acodec", "pcm_s16le", "-ac", "1", "-ar", str(sample_rate),
"-f", "wav", "-"
]
result = subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
# 尝试删除原始文件
try:
os.remove(audio_path)
except OSError as e:
logger.warning(f"Failed to remove original file: {e}")
return result.stdout
except subprocess.CalledProcessError as e:
logger.error(f"FFmpeg conversion failed: {e.stderr.decode()}")
raise RuntimeError(f"Audio conversion failed: {e.stderr.decode()}")
@staticmethod
def read_wav_info(data: bytes) -> Tuple[int, int, int, int, bytes]:
if len(data) < 44:
raise ValueError("Invalid WAV file: too short")
# 解析WAV头
chunk_id = data[:4]
if chunk_id != b'RIFF':
raise ValueError("Invalid WAV file: not RIFF format")
format_ = data[8:12]
if format_ != b'WAVE':
raise ValueError("Invalid WAV file: not WAVE format")
# 解析fmt子块
audio_format = struct.unpack('<H', data[20:22])[0]
num_channels = struct.unpack('<H', data[22:24])[0]
sample_rate = struct.unpack('<I', data[24:28])[0]
bits_per_sample = struct.unpack('<H', data[34:36])[0]
# 查找data子块
pos = 36
while pos < len(data) - 8:
subchunk_id = data[pos:pos+4]
subchunk_size = struct.unpack('<I', data[pos+4:pos+8])[0]
if subchunk_id == b'data':
wave_data = data[pos+8:pos+8+subchunk_size]
return (
num_channels,
bits_per_sample // 8,
sample_rate,
subchunk_size // (num_channels * (bits_per_sample // 8)),
wave_data
)
pos += 8 + subchunk_size
raise ValueError("Invalid WAV file: no data subchunk found")
class AsrRequestHeader:
def __init__(self):
self.message_type = MessageType.CLIENT_FULL_REQUEST
self.message_type_specific_flags = MessageTypeSpecificFlags.POS_SEQUENCE
self.serialization_type = SerializationType.JSON
self.compression_type = CompressionType.GZIP
self.reserved_data = bytes([0x00])
def with_message_type(self, message_type: int) -> 'AsrRequestHeader':
self.message_type = message_type
return self
def with_message_type_specific_flags(self, flags: int) -> 'AsrRequestHeader':
self.message_type_specific_flags = flags
return self
def with_serialization_type(self, serialization_type: int) -> 'AsrRequestHeader':
self.serialization_type = serialization_type
return self
def with_compression_type(self, compression_type: int) -> 'AsrRequestHeader':
self.compression_type = compression_type
return self
def with_reserved_data(self, reserved_data: bytes) -> 'AsrRequestHeader':
self.reserved_data = reserved_data
return self
def to_bytes(self) -> bytes:
header = bytearray()
header.append((ProtocolVersion.V1 << 4) | 1)
header.append((self.message_type << 4) | self.message_type_specific_flags)
header.append((self.serialization_type << 4) | self.compression_type)
header.extend(self.reserved_data)
return bytes(header)
@staticmethod
def default_header() -> 'AsrRequestHeader':
return AsrRequestHeader()
class RequestBuilder:
@staticmethod
def new_auth_headers() -> Dict[str, str]:
reqid = str(uuid.uuid4())
return {
"X-Api-Resource-Id": "volc.bigasr.sauc.duration",
"X-Api-Request-Id": reqid,
"X-Api-Access-Key": config.access_key,
"X-Api-App-Key": config.app_key
}
@staticmethod
def new_full_client_request(seq: int) -> bytes: # 添加seq参数
header = AsrRequestHeader.default_header() \
.with_message_type_specific_flags(MessageTypeSpecificFlags.POS_SEQUENCE)
payload = {
"user": {
"uid": "demo_uid"
},
"audio": {
"format": "wav",
"codec": "raw",
"rate": 16000,
"bits": 16,
"channel": 1
},
"request": {
"model_name": "bigmodel",
"enable_itn": True,
"enable_punc": True,
"enable_ddc": True,
"show_utterances": True,
"enable_nonstream": False
}
}
payload_bytes = json.dumps(payload).encode('utf-8')
compressed_payload = CommonUtils.gzip_compress(payload_bytes)
payload_size = len(compressed_payload)
request = bytearray()
request.extend(header.to_bytes())
request.extend(struct.pack('>i', seq)) # 使用传入的seq
request.extend(struct.pack('>I', payload_size))
request.extend(compressed_payload)
return bytes(request)
@staticmethod
def new_audio_only_request(seq: int, segment: bytes, is_last: bool = False) -> bytes:
header = AsrRequestHeader.default_header()
if is_last: # 最后一个包特殊处理
header.with_message_type_specific_flags(MessageTypeSpecificFlags.NEG_WITH_SEQUENCE)
seq = -seq # 设为负值
else:
header.with_message_type_specific_flags(MessageTypeSpecificFlags.POS_SEQUENCE)
header.with_message_type(MessageType.CLIENT_AUDIO_ONLY_REQUEST)
request = bytearray()
request.extend(header.to_bytes())
request.extend(struct.pack('>i', seq))
compressed_segment = CommonUtils.gzip_compress(segment)
request.extend(struct.pack('>I', len(compressed_segment)))
request.extend(compressed_segment)
return bytes(request)
class AsrResponse:
def __init__(self):
self.code = 0
self.event = 0
self.is_last_package = False
self.payload_sequence = 0
self.payload_size = 0
self.payload_msg = None
def to_dict(self) -> Dict[str, Any]:
return {
"code": self.code,
"event": self.event,
"is_last_package": self.is_last_package,
"payload_sequence": self.payload_sequence,
"payload_size": self.payload_size,
"payload_msg": self.payload_msg
}
class ResponseParser:
@staticmethod
def parse_response(msg: bytes) -> AsrResponse:
response = AsrResponse()
header_size = msg[0] & 0x0f
message_type = msg[1] >> 4
message_type_specific_flags = msg[1] & 0x0f
serialization_method = msg[2] >> 4
message_compression = msg[2] & 0x0f
payload = msg[header_size*4:]
# 解析message_type_specific_flags
if message_type_specific_flags & 0x01:
response.payload_sequence = struct.unpack('>i', payload[:4])[0]
payload = payload[4:]
if message_type_specific_flags & 0x02:
response.is_last_package = True
if message_type_specific_flags & 0x04:
response.event = struct.unpack('>i', payload[:4])[0]
payload = payload[4:]
# 解析message_type
if message_type == MessageType.SERVER_FULL_RESPONSE:
response.payload_size = struct.unpack('>I', payload[:4])[0]
payload = payload[4:]
elif message_type == MessageType.SERVER_ERROR_RESPONSE:
response.code = struct.unpack('>i', payload[:4])[0]
response.payload_size = struct.unpack('>I', payload[4:8])[0]
payload = payload[8:]
if not payload:
return response
# 解压缩
if message_compression == CompressionType.GZIP:
try:
payload = CommonUtils.gzip_decompress(payload)
except Exception as e:
logger.error(f"Failed to decompress payload: {e}")
return response
# 解析payload
try:
if serialization_method == SerializationType.JSON:
response.payload_msg = json.loads(payload.decode('utf-8'))
except Exception as e:
logger.error(f"Failed to parse payload: {e}")
return response
class AsrWsClient:
def __init__(self, url: str, segment_duration: int = 200):
self.seq = 1
self.url = url
self.segment_duration = segment_duration
self.conn = None
self.session = None # 添加session引用
async def __aenter__(self):
self.session = aiohttp.ClientSession()
return self
async def __aexit__(self, exc_type, exc, tb):
if self.conn and not self.conn.closed:
await self.conn.close()
if self.session and not self.session.closed:
await self.session.close()
async def read_audio_data(self, file_path: str) -> bytes:
try:
with open(file_path, 'rb') as f:
content = f.read()
if not CommonUtils.judge_wav(content):
logger.info("Converting audio to WAV format...")
content = CommonUtils.convert_wav_with_path(file_path, DEFAULT_SAMPLE_RATE)
return content
except Exception as e:
logger.error(f"Failed to read audio data: {e}")
raise
def get_segment_size(self, content: bytes) -> int:
try:
channel_num, samp_width, frame_rate, _, _ = CommonUtils.read_wav_info(content)[:5]
size_per_sec = channel_num * samp_width * frame_rate
segment_size = size_per_sec * self.segment_duration // 1000
return segment_size
except Exception as e:
logger.error(f"Failed to calculate segment size: {e}")
raise
async def create_connection(self) -> None:
headers = RequestBuilder.new_auth_headers()
try:
self.conn = await self.session.ws_connect( # 使用self.session
self.url,
headers=headers
)
logger.info(f"Connected to {self.url}")
except Exception as e:
logger.error(f"Failed to connect to WebSocket: {e}")
raise
async def send_full_client_request(self) -> None:
request = RequestBuilder.new_full_client_request(self.seq)
self.seq += 1 # 发送后递增
try:
await self.conn.send_bytes(request)
logger.info(f"Sent full client request with seq: {self.seq-1}")
msg = await self.conn.receive()
if msg.type == aiohttp.WSMsgType.BINARY:
response = ResponseParser.parse_response(msg.data)
logger.info(f"Received response: {response.to_dict()}")
else:
logger.error(f"Unexpected message type: {msg.type}")
except Exception as e:
logger.error(f"Failed to send full client request: {e}")
raise
async def send_messages(self, segment_size: int, content: bytes) -> AsyncGenerator[None, None]:
audio_segments = self.split_audio(content, segment_size)
total_segments = len(audio_segments)
for i, segment in enumerate(audio_segments):
is_last = (i == total_segments - 1)
request = RequestBuilder.new_audio_only_request(
self.seq,
segment,
is_last=is_last
)
await self.conn.send_bytes(request)
logger.info(f"Sent audio segment with seq: {self.seq} (last: {is_last})")
if not is_last:
self.seq += 1
await asyncio.sleep(self.segment_duration / 1000) # 逐个发送,间隔时间模拟实时流
# 让出控制权,允许接受消息
yield
async def recv_messages(self) -> AsyncGenerator[AsrResponse, None]:
try:
async for msg in self.conn:
if msg.type == aiohttp.WSMsgType.BINARY:
response = ResponseParser.parse_response(msg.data)
yield response
if response.is_last_package or response.code != 0:
break
elif msg.type == aiohttp.WSMsgType.ERROR:
logger.error(f"WebSocket error: {msg.data}")
break
elif msg.type == aiohttp.WSMsgType.CLOSED:
logger.info("WebSocket connection closed")
break
except Exception as e:
logger.error(f"Error receiving messages: {e}")
raise
async def start_audio_stream(self, segment_size: int, content: bytes) -> AsyncGenerator[AsrResponse, None]:
async def sender():
async for _ in self.send_messages(segment_size, content):
pass
# 启动发送和接收任务
sender_task = asyncio.create_task(sender())
try:
async for response in self.recv_messages():
yield response
finally:
sender_task.cancel()
try:
await sender_task
except asyncio.CancelledError:
pass
@staticmethod
def split_audio(data: bytes, segment_size: int) -> List[bytes]:
if segment_size <= 0:
return []
segments = []
for i in range(0, len(data), segment_size):
end = i + segment_size
if end > len(data):
end = len(data)
segments.append(data[i:end])
return segments
async def execute(self, file_path: str) -> AsyncGenerator[AsrResponse, None]:
if not file_path:
raise ValueError("File path is empty")
if not self.url:
raise ValueError("URL is empty")
self.seq = 1
try:
# 1. 读取音频文件
content = await self.read_audio_data(file_path)
# 2. 计算分段大小
segment_size = self.get_segment_size(content)
# 3. 创建WebSocket连接
await self.create_connection()
# 4. 发送完整客户端请求
await self.send_full_client_request()
# 5. 启动音频流处理
async for response in self.start_audio_stream(segment_size, content):
yield response
except Exception as e:
logger.error(f"Error in ASR execution: {e}")
raise
finally:
if self.conn:
await self.conn.close()
async def main():
import argparse
parser = argparse.ArgumentParser(description="ASR WebSocket Client")
parser.add_argument("--file", type=str, required=True, help="Audio file path")
#wss://openspeech.bytedance.com/api/v3/sauc/bigmodel
#wss://openspeech.bytedance.com/api/v3/sauc/bigmodel_async
#wss://openspeech.bytedance.com/api/v3/sauc/bigmodel_nostream
parser.add_argument("--url", type=str, default="wss://openspeech.bytedance.com/api/v3/sauc/bigmodel_nostream",
help="WebSocket URL")
parser.add_argument("--seg-duration", type=int, default=200,
help="Audio duration(ms) per packet, default:200")
args = parser.parse_args()
async with AsrWsClient(args.url, args.seg_duration) as client: # 使用async with
try:
async for response in client.execute(args.file):
logger.info(f"Received response: {json.dumps(response.to_dict(), indent=2, ensure_ascii=False)}")
except Exception as e:
logger.error(f"ASR processing failed: {e}")
if __name__ == "__main__":
asyncio.run(main())
# 用法:
# python3 sauc_websocket_demo.py --file /Users/bytedance/code/python/eng_ddc_itn.wav
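# Each frame the demo sends or receives starts with a 4-byte header, followed by
# big-endian sequence and size fields. As a worked example using the constants
# defined above, the default header built by AsrRequestHeader.to_bytes() packs to:
#   byte 0: (ProtocolVersion.V1 << 4) | header size in 4-byte words -> 0x11
#   byte 1: (CLIENT_FULL_REQUEST << 4) | POS_SEQUENCE               -> 0x11
#   byte 2: (JSON << 4) | GZIP                                      -> 0x11
#   byte 3: reserved                                                -> 0x00
# default_header = bytes([0x11, 0x11, 0x11, 0x00])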


@@ -1,403 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
简化的唤醒+录音测试
专注于解决音频冲突问题
"""
import sys
import os
import time
import threading
import pyaudio
import json
# 添加当前目录到路径
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
try:
from vosk import Model, KaldiRecognizer
VOSK_AVAILABLE = True
except ImportError:
VOSK_AVAILABLE = False
print("⚠️ Vosk 未安装,请运行: pip install vosk")
class SimpleWakeAndRecord:
"""简化的唤醒+录音系统"""
def __init__(self, model_path="model", wake_words=["你好", "助手"]):
self.model_path = model_path
self.wake_words = wake_words
self.model = None
self.recognizer = None
self.audio = None
self.stream = None
self.running = False
# 音频参数
self.FORMAT = pyaudio.paInt16
self.CHANNELS = 1
self.RATE = 16000
self.CHUNK_SIZE = 1024
# 录音相关
self.recording = False
self.recorded_frames = []
self.last_text_time = None # 最后一次识别到文字的时间
self.recording_start_time = None
self.recording_recognizer = None # 录音时专用的识别器
# 阈值
self.text_silence_threshold = 3.0 # 3秒没有识别到文字就结束
self.min_recording_time = 2.0 # 最小录音时间
self.max_recording_time = 30.0 # 最大录音时间
self._setup_model()
self._setup_audio()
def _setup_model(self):
"""设置 Vosk 模型"""
if not VOSK_AVAILABLE:
return
try:
if not os.path.exists(self.model_path):
print(f"模型路径不存在: {self.model_path}")
return
self.model = Model(self.model_path)
self.recognizer = KaldiRecognizer(self.model, self.RATE)
self.recognizer.SetWords(True)
print(f"✅ Vosk 模型加载成功")
except Exception as e:
print(f"模型初始化失败: {e}")
def _setup_audio(self):
"""设置音频设备"""
try:
if self.audio is None:
self.audio = pyaudio.PyAudio()
if self.stream is None:
self.stream = self.audio.open(
format=self.FORMAT,
channels=self.CHANNELS,
rate=self.RATE,
input=True,
frames_per_buffer=self.CHUNK_SIZE
)
print("✅ 音频设备初始化成功")
except Exception as e:
print(f"音频设备初始化失败: {e}")
def _calculate_energy(self, audio_data):
"""计算音频能量"""
if len(audio_data) == 0:
return 0
import numpy as np
audio_array = np.frombuffer(audio_data, dtype=np.int16)
rms = np.sqrt(np.mean(audio_array ** 2))
return rms
def _check_wake_word(self, text):
"""检查是否包含唤醒词"""
if not text or not self.wake_words:
return False, None
text_lower = text.lower()
for wake_word in self.wake_words:
if wake_word.lower() in text_lower:
return True, wake_word
return False, None
def _save_recording(self, audio_data):
"""保存录音"""
timestamp = time.strftime("%Y%m%d_%H%M%S")
filename = f"recording_{timestamp}.wav"
try:
import wave
with wave.open(filename, 'wb') as wf:
wf.setnchannels(self.CHANNELS)
wf.setsampwidth(self.audio.get_sample_size(self.FORMAT))
wf.setframerate(self.RATE)
wf.writeframes(audio_data)
print(f"✅ 录音已保存: {filename}")
return True, filename
except Exception as e:
print(f"保存录音失败: {e}")
return False, None
def _play_audio(self, filename):
"""播放音频文件"""
try:
import wave
# 打开音频文件
with wave.open(filename, 'rb') as wf:
# 获取音频参数
channels = wf.getnchannels()
width = wf.getsampwidth()
rate = wf.getframerate()
total_frames = wf.getnframes()
# 分块读取音频数据,避免内存问题
chunk_size = 1024
frames = []
for _ in range(0, total_frames, chunk_size):
chunk = wf.readframes(chunk_size)
if chunk:
frames.append(chunk)
else:
break
# 创建播放流
playback_stream = self.audio.open(
format=self.audio.get_format_from_width(width),
channels=channels,
rate=rate,
output=True
)
print(f"🔊 开始播放: {filename}")
# 分块播放音频
for chunk in frames:
playback_stream.write(chunk)
# 等待播放完成
playback_stream.stop_stream()
playback_stream.close()
print("✅ 播放完成")
except Exception as e:
print(f"❌ 播放失败: {e}")
# 如果pyaudio播放失败尝试用系统命令播放
self._play_with_system_player(filename)
def _play_with_system_player(self, filename):
"""使用系统播放器播放音频"""
try:
import platform
import subprocess
system = platform.system()
if system == 'Darwin': # macOS
cmd = ['afplay', filename]
elif system == 'Windows':
cmd = ['start', '/min', filename]
else: # Linux
cmd = ['aplay', filename]
print(f"🔊 使用系统播放器: {' '.join(cmd)}")
subprocess.run(cmd, check=True)
print("✅ 播放完成")
except Exception as e:
print(f"❌ 系统播放器也失败: {e}")
print(f"💡 文件已保存,请手动播放: {filename}")
def _start_recording(self):
"""开始录音"""
print("🎙️ 开始录音,请说话...")
self.recording = True
self.recorded_frames = []
self.last_text_time = None
self.recording_start_time = time.time()
# 为录音创建一个新的识别器
if self.model:
self.recording_recognizer = KaldiRecognizer(self.model, self.RATE)
self.recording_recognizer.SetWords(True)
def _stop_recording(self):
"""停止录音"""
if len(self.recorded_frames) > 0:
audio_data = b''.join(self.recorded_frames)
duration = len(audio_data) / (self.RATE * 2)
print(f"📝 录音完成,时长: {duration:.2f}")
# 保存录音
success, filename = self._save_recording(audio_data)
# 如果保存成功,播放录音
if success and filename:
print("=" * 50)
print("🔊 播放刚才录制的音频...")
self._play_audio(filename)
print("=" * 50)
self.recording = False
self.recorded_frames = []
self.last_text_time = None
self.recording_start_time = None
self.recording_recognizer = None
def start(self):
"""开始唤醒词检测和录音"""
if not self.stream:
print("❌ 音频设备未初始化")
return
self.running = True
print("🎤 开始监听...")
print(f"唤醒词: {', '.join(self.wake_words)}")
try:
while self.running:
# 读取音频数据
data = self.stream.read(self.CHUNK_SIZE, exception_on_overflow=False)
if len(data) == 0:
continue
if self.recording:
# 录音模式
self.recorded_frames.append(data)
recording_duration = time.time() - self.recording_start_time
# 使用录音专用的识别器进行实时识别
if self.recording_recognizer:
if self.recording_recognizer.AcceptWaveform(data):
# 获取最终识别结果
result = json.loads(self.recording_recognizer.Result())
text = result.get('text', '').strip()
if text:
# 识别到文字,更新时间戳
self.last_text_time = time.time()
print(f"\n📝 识别: {text}")
else:
# 获取部分识别结果
partial_result = json.loads(self.recording_recognizer.PartialResult())
partial_text = partial_result.get('partial', '').strip()
if partial_text:
# 更新时间戳(部分识别也算有声音)
self.last_text_time = time.time()
status = f"录音中... {recording_duration:.1f}s | {partial_text}"
print(f"\r{status}", end='', flush=True)
# 检查是否需要结束录音
current_time = time.time()
# 检查是否有文字识别超时
if self.last_text_time is not None:
text_silence_duration = current_time - self.last_text_time
if text_silence_duration > self.text_silence_threshold and recording_duration >= self.min_recording_time:
print(f"\n\n3秒没有识别到文字结束录音")
self._stop_recording()
else:
# 还没有识别到任何文字,检查是否超时
if recording_duration > 5.0: # 如果5秒还没识别到任何文字也结束
print(f"\n\n5秒没有识别到文字结束录音")
self._stop_recording()
# 检查最大录音时间
if recording_duration > self.max_recording_time:
print(f"\n\n达到最大录音时间 {self.max_recording_time}s")
self._stop_recording()
# 显示录音状态
if self.last_text_time is None:
status = f"等待语音输入... {recording_duration:.1f}s"
print(f"\r{status}", end='', flush=True)
elif self.model and self.recognizer:
# 唤醒词检测模式
if self.recognizer.AcceptWaveform(data):
result = json.loads(self.recognizer.Result())
text = result.get('text', '').strip()
if text:
print(f"识别: {text}")
# 检查唤醒词
is_wake_word, detected_word = self._check_wake_word(text)
if is_wake_word:
print(f"🎯 检测到唤醒词: {detected_word}")
self._start_recording()
else:
# 显示实时音频级别
energy = self._calculate_energy(data)
if energy > 50: # 只显示有意义的音频级别
partial_result = json.loads(self.recognizer.PartialResult())
partial_text = partial_result.get('partial', '')
if partial_text:
status = f"监听中... 能量: {energy:.0f} | {partial_text}"
else:
status = f"监听中... 能量: {energy:.0f}"
print(status, end='\r')
time.sleep(0.01)
except KeyboardInterrupt:
print("\n👋 退出")
except Exception as e:
print(f"错误: {e}")
finally:
self.stop()
def stop(self):
"""停止"""
self.running = False
if self.recording:
self._stop_recording()
if self.stream:
self.stream.stop_stream()
self.stream.close()
self.stream = None
if self.audio:
self.audio.terminate()
self.audio = None
def main():
"""主函数"""
print("🚀 简化唤醒+录音测试")
print("=" * 50)
# 检查模型
model_dir = "model"
if not os.path.exists(model_dir):
print("⚠️ 未找到模型目录")
print("请下载 Vosk 模型到 model 目录")
return
# 创建系统
system = SimpleWakeAndRecord(
model_path=model_dir,
wake_words=["你好", "助手", "小爱"]
)
if not system.model:
print("❌ 模型加载失败")
return
print("✅ 系统初始化成功")
print("📖 使用说明:")
print("1. 说唤醒词开始录音")
print("2. 基于语音识别判断3秒没有识别到文字就结束")
print("3. 最少录音2秒最多30秒")
print("4. 录音时实时显示识别结果")
print("5. 录音文件自动保存")
print("6. 录音完成后自动播放刚才录制的内容")
print("7. 按 Ctrl+C 退出")
print("=" * 50)
# 开始运行
system.start()
if __name__ == "__main__":
main()

532
speech_recognizer.py Normal file

@@ -0,0 +1,532 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
语音识别模块
基于 SAUC API 为录音文件提供语音识别功能
"""
import os
import json
import time
import logging
import asyncio
import aiohttp
import struct
import gzip
import uuid
from typing import Optional, List, Dict, Any, AsyncGenerator
from dataclasses import dataclass
# 配置日志
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
# 常量定义
DEFAULT_SAMPLE_RATE = 16000
class ProtocolVersion:
V1 = 0b0001
class MessageType:
CLIENT_FULL_REQUEST = 0b0001
CLIENT_AUDIO_ONLY_REQUEST = 0b0010
SERVER_FULL_RESPONSE = 0b1001
SERVER_ERROR_RESPONSE = 0b1111
class MessageTypeSpecificFlags:
NO_SEQUENCE = 0b0000
POS_SEQUENCE = 0b0001
NEG_SEQUENCE = 0b0010
NEG_WITH_SEQUENCE = 0b0011
class SerializationType:
NO_SERIALIZATION = 0b0000
JSON = 0b0001
class CompressionType:
GZIP = 0b0001
@dataclass
class RecognitionResult:
"""语音识别结果"""
text: str
confidence: float
is_final: bool
start_time: Optional[float] = None
end_time: Optional[float] = None
class AudioUtils:
"""音频处理工具类"""
@staticmethod
def gzip_compress(data: bytes) -> bytes:
"""GZIP压缩"""
return gzip.compress(data)
@staticmethod
def gzip_decompress(data: bytes) -> bytes:
"""GZIP解压缩"""
return gzip.decompress(data)
@staticmethod
def is_wav_file(data: bytes) -> bool:
"""检查是否为WAV文件"""
if len(data) < 44:
return False
return data[:4] == b'RIFF' and data[8:12] == b'WAVE'
@staticmethod
def read_wav_info(data: bytes) -> tuple:
"""读取WAV文件信息"""
if len(data) < 44:
raise ValueError("Invalid WAV file: too short")
# 解析WAV头
chunk_id = data[:4]
if chunk_id != b'RIFF':
raise ValueError("Invalid WAV file: not RIFF format")
format_ = data[8:12]
if format_ != b'WAVE':
raise ValueError("Invalid WAV file: not WAVE format")
# 解析fmt子块
audio_format = struct.unpack('<H', data[20:22])[0]
num_channels = struct.unpack('<H', data[22:24])[0]
sample_rate = struct.unpack('<I', data[24:28])[0]
bits_per_sample = struct.unpack('<H', data[34:36])[0]
# 查找data子块
pos = 36
while pos < len(data) - 8:
subchunk_id = data[pos:pos+4]
subchunk_size = struct.unpack('<I', data[pos+4:pos+8])[0]
if subchunk_id == b'data':
wave_data = data[pos+8:pos+8+subchunk_size]
return (
num_channels,
bits_per_sample // 8,
sample_rate,
subchunk_size // (num_channels * (bits_per_sample // 8)),
wave_data
)
pos += 8 + subchunk_size
raise ValueError("Invalid WAV file: no data subchunk found")
class AsrConfig:
"""ASR配置"""
def __init__(self, app_key: str = None, access_key: str = None):
self.auth = {
"app_key": app_key or os.getenv("SAUC_APP_KEY", "your_app_key"),
"access_key": access_key or os.getenv("SAUC_ACCESS_KEY", "your_access_key")
}
@property
def app_key(self) -> str:
return self.auth["app_key"]
@property
def access_key(self) -> str:
return self.auth["access_key"]
class AsrRequestHeader:
"""ASR请求头"""
def __init__(self):
self.message_type = MessageType.CLIENT_FULL_REQUEST
self.message_type_specific_flags = MessageTypeSpecificFlags.POS_SEQUENCE
self.serialization_type = SerializationType.JSON
self.compression_type = CompressionType.GZIP
self.reserved_data = bytes([0x00])
def with_message_type(self, message_type: int) -> 'AsrRequestHeader':
self.message_type = message_type
return self
def with_message_type_specific_flags(self, flags: int) -> 'AsrRequestHeader':
self.message_type_specific_flags = flags
return self
def with_serialization_type(self, serialization_type: int) -> 'AsrRequestHeader':
self.serialization_type = serialization_type
return self
def with_compression_type(self, compression_type: int) -> 'AsrRequestHeader':
self.compression_type = compression_type
return self
def with_reserved_data(self, reserved_data: bytes) -> 'AsrRequestHeader':
self.reserved_data = reserved_data
return self
def to_bytes(self) -> bytes:
header = bytearray()
header.append((ProtocolVersion.V1 << 4) | 1)
header.append((self.message_type << 4) | self.message_type_specific_flags)
header.append((self.serialization_type << 4) | self.compression_type)
header.extend(self.reserved_data)
return bytes(header)
@staticmethod
def default_header() -> 'AsrRequestHeader':
return AsrRequestHeader()
class RequestBuilder:
"""请求构建器"""
@staticmethod
def new_auth_headers(config: AsrConfig) -> Dict[str, str]:
"""创建认证头"""
reqid = str(uuid.uuid4())
return {
"X-Api-Resource-Id": "volc.bigasr.sauc.duration",
"X-Api-Request-Id": reqid,
"X-Api-Access-Key": config.access_key,
"X-Api-App-Key": config.app_key
}
@staticmethod
def new_full_client_request(seq: int) -> bytes:
"""创建完整客户端请求"""
header = AsrRequestHeader.default_header() \
.with_message_type_specific_flags(MessageTypeSpecificFlags.POS_SEQUENCE)
payload = {
"user": {
"uid": "local_voice_user"
},
"audio": {
"format": "wav",
"codec": "raw",
"rate": 16000,
"bits": 16,
"channel": 1
},
"request": {
"model_name": "bigmodel",
"enable_itn": True,
"enable_punc": True,
"enable_ddc": True,
"show_utterances": True,
"enable_nonstream": False
}
}
payload_bytes = json.dumps(payload).encode('utf-8')
compressed_payload = AudioUtils.gzip_compress(payload_bytes)
payload_size = len(compressed_payload)
request = bytearray()
request.extend(header.to_bytes())
request.extend(struct.pack('>i', seq))
request.extend(struct.pack('>I', payload_size))
request.extend(compressed_payload)
return bytes(request)
@staticmethod
def new_audio_only_request(seq: int, segment: bytes, is_last: bool = False) -> bytes:
"""创建纯音频请求"""
header = AsrRequestHeader.default_header()
if is_last:
header.with_message_type_specific_flags(MessageTypeSpecificFlags.NEG_WITH_SEQUENCE)
seq = -seq
else:
header.with_message_type_specific_flags(MessageTypeSpecificFlags.POS_SEQUENCE)
header.with_message_type(MessageType.CLIENT_AUDIO_ONLY_REQUEST)
request = bytearray()
request.extend(header.to_bytes())
request.extend(struct.pack('>i', seq))
compressed_segment = AudioUtils.gzip_compress(segment)
request.extend(struct.pack('>I', len(compressed_segment)))
request.extend(compressed_segment)
return bytes(request)
class AsrResponse:
"""ASR响应"""
def __init__(self):
self.code = 0
self.event = 0
self.is_last_package = False
self.payload_sequence = 0
self.payload_size = 0
self.payload_msg = None
def to_dict(self) -> Dict[str, Any]:
return {
"code": self.code,
"event": self.event,
"is_last_package": self.is_last_package,
"payload_sequence": self.payload_sequence,
"payload_size": self.payload_size,
"payload_msg": self.payload_msg
}
class ResponseParser:
"""响应解析器"""
@staticmethod
def parse_response(msg: bytes) -> AsrResponse:
"""解析响应"""
response = AsrResponse()
header_size = msg[0] & 0x0f
message_type = msg[1] >> 4
message_type_specific_flags = msg[1] & 0x0f
serialization_method = msg[2] >> 4
message_compression = msg[2] & 0x0f
payload = msg[header_size*4:]
# 解析message_type_specific_flags
if message_type_specific_flags & 0x01:
response.payload_sequence = struct.unpack('>i', payload[:4])[0]
payload = payload[4:]
if message_type_specific_flags & 0x02:
response.is_last_package = True
if message_type_specific_flags & 0x04:
response.event = struct.unpack('>i', payload[:4])[0]
payload = payload[4:]
# 解析message_type
if message_type == MessageType.SERVER_FULL_RESPONSE:
response.payload_size = struct.unpack('>I', payload[:4])[0]
payload = payload[4:]
elif message_type == MessageType.SERVER_ERROR_RESPONSE:
response.code = struct.unpack('>i', payload[:4])[0]
response.payload_size = struct.unpack('>I', payload[4:8])[0]
payload = payload[8:]
if not payload:
return response
# 解压缩
if message_compression == CompressionType.GZIP:
try:
payload = AudioUtils.gzip_decompress(payload)
except Exception as e:
logger.error(f"Failed to decompress payload: {e}")
return response
# 解析payload
try:
if serialization_method == SerializationType.JSON:
response.payload_msg = json.loads(payload.decode('utf-8'))
except Exception as e:
logger.error(f"Failed to parse payload: {e}")
return response
class SpeechRecognizer:
"""语音识别器"""
def __init__(self, app_key: str = None, access_key: str = None,
url: str = "wss://openspeech.bytedance.com/api/v3/sauc/bigmodel_nostream"):
self.config = AsrConfig(app_key, access_key)
self.url = url
self.seq = 1
async def recognize_file(self, file_path: str) -> List[RecognitionResult]:
"""识别音频文件"""
if not os.path.exists(file_path):
raise FileNotFoundError(f"Audio file not found: {file_path}")
results = []
try:
async with aiohttp.ClientSession() as session:
# 读取音频文件
with open(file_path, 'rb') as f:
content = f.read()
if not AudioUtils.is_wav_file(content):
raise ValueError("Audio file must be in WAV format")
# 获取音频信息
try:
_, _, sample_rate, _, audio_data = AudioUtils.read_wav_info(content)
if sample_rate != DEFAULT_SAMPLE_RATE:
logger.warning(f"Sample rate {sample_rate} != {DEFAULT_SAMPLE_RATE}, may affect recognition accuracy")
except Exception as e:
logger.error(f"Failed to read audio info: {e}")
raise
# 计算分段大小 (200ms per segment)
segment_size = 1 * 2 * DEFAULT_SAMPLE_RATE * 200 // 1000 # channel * bytes_per_sample * sample_rate * duration_ms / 1000
# 创建WebSocket连接
headers = RequestBuilder.new_auth_headers(self.config)
async with session.ws_connect(self.url, headers=headers) as ws:
# 发送完整客户端请求
request = RequestBuilder.new_full_client_request(self.seq)
self.seq += 1
await ws.send_bytes(request)
# 接收初始响应
msg = await ws.receive()
if msg.type == aiohttp.WSMsgType.BINARY:
response = ResponseParser.parse_response(msg.data)
logger.info(f"Initial response: {response.to_dict()}")
# 分段发送音频数据
audio_segments = self._split_audio(audio_data, segment_size)
total_segments = len(audio_segments)
for i, segment in enumerate(audio_segments):
is_last = (i == total_segments - 1)
request = RequestBuilder.new_audio_only_request(
self.seq,
segment,
is_last=is_last
)
await ws.send_bytes(request)
logger.info(f"Sent audio segment {i+1}/{total_segments}")
if not is_last:
self.seq += 1
# 短暂延迟模拟实时流
await asyncio.sleep(0.1)
# 接收识别结果
final_text = ""
while True:
msg = await ws.receive()
if msg.type == aiohttp.WSMsgType.BINARY:
response = ResponseParser.parse_response(msg.data)
if response.payload_msg and 'text' in response.payload_msg:
text = response.payload_msg['text']
if text:
final_text += text
result = RecognitionResult(
text=text,
confidence=0.9, # 默认置信度
is_final=response.is_last_package
)
results.append(result)
logger.info(f"Recognized: {text}")
if response.is_last_package or response.code != 0:
break
elif msg.type == aiohttp.WSMsgType.ERROR:
logger.error(f"WebSocket error: {msg.data}")
break
elif msg.type == aiohttp.WSMsgType.CLOSED:
logger.info("WebSocket connection closed")
break
# 如果没有获得最终结果,创建一个包含所有文本的结果
if final_text and not any(r.is_final for r in results):
final_result = RecognitionResult(
text=final_text,
confidence=0.9,
is_final=True
)
results.append(final_result)
return results
except Exception as e:
logger.error(f"Speech recognition failed: {e}")
raise
def _split_audio(self, data: bytes, segment_size: int) -> List[bytes]:
"""分割音频数据"""
if segment_size <= 0:
return []
segments = []
for i in range(0, len(data), segment_size):
end = i + segment_size
if end > len(data):
end = len(data)
segments.append(data[i:end])
return segments
async def recognize_latest_recording(self, directory: str = ".") -> Optional[RecognitionResult]:
"""识别最新的录音文件"""
# 查找最新的录音文件
recording_files = [f for f in os.listdir(directory) if f.startswith('recording_') and f.endswith('.wav')]
if not recording_files:
logger.warning("No recording files found")
return None
# 按文件名排序(包含时间戳)
recording_files.sort(reverse=True)
latest_file = recording_files[0]
latest_path = os.path.join(directory, latest_file)
logger.info(f"Recognizing latest recording: {latest_file}")
try:
results = await self.recognize_file(latest_path)
if results:
# 返回最终的识别结果
final_results = [r for r in results if r.is_final]
if final_results:
return final_results[-1]
else:
# 如果没有标记为final的结果返回最后一个
return results[-1]
except Exception as e:
logger.error(f"Failed to recognize latest recording: {e}")
return None
async def main():
"""测试函数"""
import argparse
parser = argparse.ArgumentParser(description="语音识别测试")
parser.add_argument("--file", type=str, help="音频文件路径")
parser.add_argument("--latest", action="store_true", help="识别最新的录音文件")
parser.add_argument("--app-key", type=str, help="SAUC App Key")
parser.add_argument("--access-key", type=str, help="SAUC Access Key")
args = parser.parse_args()
recognizer = SpeechRecognizer(
app_key=args.app_key,
access_key=args.access_key
)
try:
if args.latest:
result = await recognizer.recognize_latest_recording()
if result:
print(f"识别结果: {result.text}")
print(f"置信度: {result.confidence}")
print(f"最终结果: {result.is_final}")
else:
print("未能识别到语音内容")
elif args.file:
results = await recognizer.recognize_file(args.file)
for i, result in enumerate(results):
print(f"结果 {i+1}: {result.text}")
print(f"置信度: {result.confidence}")
print(f"最终结果: {result.is_final}")
print("-" * 40)
else:
print("请指定 --file 或 --latest 参数")
except Exception as e:
print(f"识别失败: {e}")
if __name__ == "__main__":
asyncio.run(main())
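As a quick check of the segment-size arithmetic in recognize_file above: at 16 kHz, 16-bit mono, a 200 ms segment works out to

segment_size = 1 * 2 * 16000 * 200 // 1000   # channels * bytes/sample * rate * ms // 1000 = 6400 bytes

so a 3-second recording (96000 bytes of PCM) is sent as 15 segments.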


@@ -1,344 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
语音录制模块
基于pyaudio实现支持语音活动检测(VAD)自动判断录音结束
"""
import pyaudio
import wave
import numpy as np
import time
import os
import threading
from collections import deque
class VoiceRecorder:
"""语音录制器,支持自动检测语音结束"""
def __init__(self,
energy_threshold=500,
silence_threshold=1.0,
min_recording_time=0.5,
max_recording_time=10.0,
sample_rate=16000,
chunk_size=1024,
defer_audio_init=False):
"""
初始化录音器
Args:
energy_threshold: 语音能量阈值
silence_threshold: 静音持续时间阈值
min_recording_time: 最小录音时间
max_recording_time: 最大录音时间
sample_rate: 采样率
chunk_size: 音频块大小
defer_audio_init: 是否延迟音频初始化
"""
self.energy_threshold = energy_threshold
self.silence_threshold = silence_threshold
self.min_recording_time = min_recording_time
self.max_recording_time = max_recording_time
self.sample_rate = sample_rate
self.chunk_size = chunk_size
self.defer_audio_init = defer_audio_init
# 音频参数
self.FORMAT = pyaudio.paInt16
self.CHANNELS = 1
# 状态变量
self.audio = None
self.stream = None
self.recording = False
self.recorded_frames = []
# 语音检测相关
self.silence_start_time = None
self.recording_start_time = None
self.audio_buffer = deque(maxlen=int(sample_rate / chunk_size * 2)) # 2秒缓冲
# 回调函数
self.on_recording_complete = None
self.on_speech_detected = None
if not defer_audio_init:
self._setup_audio()
def _setup_audio(self):
"""设置音频设备"""
try:
self.audio = pyaudio.PyAudio()
# 获取默认输入设备信息
device_info = self.audio.get_default_input_device_info()
print(f"使用音频设备: {device_info['name']}")
except Exception as e:
print(f"音频设备初始化失败: {e}")
raise
def _calculate_energy(self, audio_data):
"""计算音频能量"""
if len(audio_data) == 0:
return 0
# 转换为numpy数组
audio_array = np.frombuffer(audio_data, dtype=np.int16)
# 计算RMS能量
rms = np.sqrt(np.mean(audio_array ** 2))
return rms
def _is_speech(self, audio_data):
"""判断是否为语音"""
energy = self._calculate_energy(audio_data)
return energy > self.energy_threshold
def _open_stream(self):
"""打开音频流"""
if self.stream is not None:
return
self.stream = self.audio.open(
format=self.FORMAT,
channels=self.CHANNELS,
rate=self.sample_rate,
input=True,
frames_per_buffer=self.chunk_size
)
def _close_stream(self):
"""关闭音频流"""
if self.stream:
self.stream.stop_stream()
self.stream.close()
self.stream = None
def start_listening(self):
"""开始监听语音"""
if self.recording:
print("正在录音中...")
return
self._open_stream()
self.recording = True
self.recorded_frames = []
self.silence_start_time = None
self.recording_start_time = None
print("开始监听语音...")
# 在新线程中录音
recording_thread = threading.Thread(target=self._record_loop)
recording_thread.daemon = True
recording_thread.start()
def _record_loop(self):
"""录音循环"""
try:
while self.recording:
# 读取音频数据
data = self.stream.read(self.chunk_size, exception_on_overflow=False)
if len(data) == 0:
continue
# 计算能量
energy = self._calculate_energy(data)
# 添加到缓冲区
self.audio_buffer.append(data)
# 检测语音活动
if energy > self.energy_threshold:
# 检测到语音
if self.recording_start_time is None:
# 开始录音
self.recording_start_time = time.time()
self.silence_start_time = None
self.recorded_frames = list(self.audio_buffer) # 包含之前的音频
print("🎤 检测到语音,开始录音...")
if self.on_speech_detected:
self.on_speech_detected()
# 重置静音计时
self.silence_start_time = None
# 录音
self.recorded_frames.append(data)
elif self.recording_start_time is not None:
# 之前有语音,现在检查是否静音
if self.silence_start_time is None:
self.silence_start_time = time.time()
# 继续录音
self.recorded_frames.append(data)
# 检查是否静音超时
silence_duration = time.time() - self.silence_start_time
if silence_duration > self.silence_threshold:
recording_duration = time.time() - self.recording_start_time
# 检查最小录音时间
if recording_duration >= self.min_recording_time:
print(f"静音 {silence_duration:.1f}s结束录音")
self.stop_recording()
break
else:
print(f"录音时间太短 ({recording_duration:.1f}s),继续等待...")
self.silence_start_time = time.time()
# 检查最大录音时间
if self.recording_start_time is not None:
recording_duration = time.time() - self.recording_start_time
if recording_duration > self.max_recording_time:
print(f"达到最大录音时间 {self.max_recording_time}s结束录音")
self.stop_recording()
break
# 短暂休眠
time.sleep(0.01)
except Exception as e:
print(f"录音过程中发生错误: {e}")
self.stop_recording()
def stop_recording(self):
"""停止录音"""
if not self.recording:
return
self.recording = False
self._close_stream()
if len(self.recorded_frames) > 0:
# 保存录音
audio_data = b''.join(self.recorded_frames)
print(f"录音完成,共 {len(self.recorded_frames)}")
print(f"录音时长: {len(audio_data) / (self.sample_rate * 2):.2f}")
# 调用回调函数
if self.on_recording_complete:
self.on_recording_complete(audio_data)
# 重置状态
self.recorded_frames = []
self.silence_start_time = None
self.recording_start_time = None
def save_audio(self, audio_data, filename):
"""保存音频到文件"""
try:
with wave.open(filename, 'wb') as wf:
wf.setnchannels(self.CHANNELS)
wf.setsampwidth(self.audio.get_sample_size(self.FORMAT))
wf.setframerate(self.sample_rate)
wf.writeframes(audio_data)
print(f"音频已保存到: {filename}")
return True
except Exception as e:
print(f"保存音频失败: {e}")
return False
def set_recording_complete_callback(self, callback):
"""设置录音完成回调函数"""
self.on_recording_complete = callback
def set_speech_detected_callback(self, callback):
"""设置语音检测回调函数"""
self.on_speech_detected = callback
def adjust_sensitivity(self, energy_threshold=None, silence_threshold=None):
"""调整灵敏度"""
if energy_threshold is not None:
self.energy_threshold = energy_threshold
print(f"能量阈值调整为: {energy_threshold}")
if silence_threshold is not None:
self.silence_threshold = silence_threshold
print(f"静音阈值调整为: {silence_threshold}")
def get_audio_level(self):
"""获取当前音频级别"""
if len(self.audio_buffer) > 0:
latest_data = self.audio_buffer[-1]
return self._calculate_energy(latest_data)
return 0
def cleanup(self):
"""清理资源"""
self.stop_recording()
if self.audio:
self.audio.terminate()
self.audio = None
def main():
"""测试录音功能"""
print("🎙️ 语音录制测试")
print("=" * 50)
print("配置:")
print("- 能量阈值: 500")
print("- 静音阈值: 1.0秒")
print("- 最小录音时间: 0.5秒")
print("- 最大录音时间: 10秒")
print("=" * 50)
print("请说话测试录音功能...")
print("按 Ctrl+C 退出")
def on_recording_complete(audio_data):
"""录音完成回调"""
# 保存录音文件
timestamp = time.strftime("%Y%m%d_%H%M%S")
filename = f"recording_{timestamp}.wav"
recorder.save_audio(audio_data, filename)
print(f"✅ 录音文件已保存: {filename}")
# 显示录音信息
duration = len(audio_data) / (recorder.sample_rate * 2)
print(f"录音时长: {duration:.2f}")
def on_speech_detected():
"""检测到语音回调"""
print("🔊 检测到语音活动...")
# 创建录音器
recorder = VoiceRecorder(
energy_threshold=500,
silence_threshold=1.0,
min_recording_time=0.5,
max_recording_time=10.0
)
# 设置回调
recorder.set_recording_complete_callback(on_recording_complete)
recorder.set_speech_detected_callback(on_speech_detected)
try:
# 开始监听
recorder.start_listening()
# 保持程序运行
while True:
time.sleep(0.1)
# 显示当前音频级别(可选)
level = recorder.get_audio_level()
if level > 100:
print(f"当前音频级别: {level:.0f}", end='\r')
except KeyboardInterrupt:
print("\n👋 退出录音测试")
finally:
recorder.cleanup()
if __name__ == "__main__":
main()