From 70c42eca1530d95eca582d754f9ec6d918b88e8f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=9C=B1=E6=BD=AE?= <zhuchaowe@users.noreply.github.com>
Date: Sat, 20 Sep 2025 11:07:54 +0800
Subject: [PATCH] =?UTF-8?q?=E6=BF=80=E8=BF=9B=E6=80=A7=E8=83=BD=E4=BC=98?=
 =?UTF-8?q?=E5=8C=96=EF=BC=9A=E5=A4=A7=E5=B9=85=E9=99=8D=E4=BD=8E=E6=A0=91?=
 =?UTF-8?q?=E8=8E=93=E6=B4=BE3B=E5=BB=B6=E8=BF=9F?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- 音频参数：8kHz采样率，4096块大小（4倍）
- 激进模式：直接处理，跳过部分识别结果
- 缓冲优化：5个块缓冲区，0.2秒处理间隔
- 禁用词级识别：提升Vosk处理速度
- 实时延迟监控：显示音频处理延迟
- 预期效果：从10秒延迟降低到<1秒

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
---
 simple_wake_and_record.py | 247 ++++++++++++++++++++++++++------------
 1 file changed, 173 insertions(+), 74 deletions(-)

diff --git a/simple_wake_and_record.py b/simple_wake_and_record.py
index 23d9c3e..2e38396 100644
--- a/simple_wake_and_record.py
+++ b/simple_wake_and_record.py
@@ -35,11 +35,12 @@ class SimpleWakeAndRecord:
         self.stream = None
         self.running = False
         
-        # 音频参数 - 优化为树莓派3B
+        # 音频参数 - 激进优化为树莓派3B
         self.FORMAT = pyaudio.paInt16
         self.CHANNELS = 1
         self.RATE = 8000  # 从16kHz降至8kHz，减少50%数据处理量
-        self.CHUNK_SIZE = 2048  # 增大块大小，减少处理次数
+        self.CHUNK_SIZE = 4096  # 进一步增大块大小到4KB，大幅减少处理次数
+        self.AGGRESSIVE_MODE = True  # 激进优化模式
         
         # 录音相关
         self.recording = False
@@ -48,18 +49,25 @@ class SimpleWakeAndRecord:
         self.recording_start_time = None
         self.recording_recognizer = None  # 录音时专用的识别器
         
-        # 性能优化相关
+        # 性能优化相关 - 激进优化
         self.audio_buffer = []  # 音频缓冲区
-        self.buffer_size = 10  # 缓冲区大小（块数）
+        self.buffer_size = 5  # 减小缓冲区大小，减少内存使用
         self.last_process_time = time.time()  # 上次处理时间
-        self.process_interval = 0.5  # 处理间隔（秒）
-        self.batch_process_size = 5  # 批处理大小
+        self.process_interval = 0.2  # 缩短处理间隔，提高响应速度
+        self.batch_process_size = 3  # 减少批处理大小，更快处理
+        self.skip_partial_results = True  # 跳过部分识别结果，只处理最终结果
         
         # 性能监控
         self.process_count = 0
         self.avg_process_time = 0
         self.last_monitor_time = time.time()
-        self.monitor_interval = 5.0  # 监控间隔（秒）
+        self.monitor_interval = 3.0  # 缩短监控间隔
+        
+        # 延迟监控
+        self.audio_receive_times = []  # 音频接收时间戳
+        self.process_start_times = []  # 处理开始时间
+        self.latency_samples = []  # 延迟样本
+        self.max_latency_samples = 10  # 最大延迟样本数
         
         # 阈值
         self.text_silence_threshold = 3.0  # 3秒没有识别到文字就结束
@@ -79,11 +87,21 @@ class SimpleWakeAndRecord:
                 print(f"模型路径不存在: {self.model_path}")
                 return
             
+            print(f"🔄 正在加载模型，这可能需要一些时间...")
+            start_time = time.time()
+            
             self.model = Model(self.model_path)
             self.recognizer = KaldiRecognizer(self.model, self.RATE)
-            self.recognizer.SetWords(True)
             
-            print(f"✅ Vosk 模型加载成功")
+            # 激进模式：禁用词级识别以提高性能
+            if self.AGGRESSIVE_MODE:
+                self.recognizer.SetWords(False)
+                print(f"📉 激进模式：已禁用词级识别以提高性能")
+            else:
+                self.recognizer.SetWords(True)
+            
+            load_time = time.time() - start_time
+            print(f"✅ Vosk 模型加载成功 (耗时: {load_time:.2f}s)")
             
         except Exception as e:
             print(f"模型初始化失败: {e}")
@@ -142,6 +160,7 @@ class SimpleWakeAndRecord:
         
         # 记录处理开始时间
         start_time = time.time()
+        self.process_start_times.append(start_time)
         
         # 取出批处理数据
         batch_data = self.audio_buffer[:self.batch_process_size]
@@ -168,9 +187,48 @@ class SimpleWakeAndRecord:
         current_time = time.time()
         if current_time - self.last_monitor_time >= self.monitor_interval:
             buffer_usage = len(self.audio_buffer) / self.buffer_size * 100
-            print(f"\n📊 性能监控 | 处理次数: {self.process_count} | 平均处理时间: {self.avg_process_time:.3f}s | 缓冲区使用: {buffer_usage:.1f}%")
+            
+            # 计算平均延迟
+            avg_latency = 0
+            if self.latency_samples:
+                avg_latency = sum(self.latency_samples) / len(self.latency_samples)
+            
+            print(f"\n📊 性能监控 | 处理次数: {self.process_count} | 平均处理时间: {self.avg_process_time:.3f}s | 缓冲区使用: {buffer_usage:.1f}% | 平均延迟: {avg_latency:.2f}s")
             self.last_monitor_time = current_time
     
+    def _calculate_latency(self, audio_time):
+        """计算音频延迟"""
+        current_time = time.time()
+        latency = current_time - audio_time
+        
+        # 添加到延迟样本
+        self.latency_samples.append(latency)
+        if len(self.latency_samples) > self.max_latency_samples:
+            self.latency_samples.pop(0)
+        
+        return latency
+    
+    def _lightweight_recognition(self, recognizer, audio_data):
+        """轻量级识别处理"""
+        if not recognizer:
+            return None
+        
+        # 激进模式：跳过部分识别结果，只处理最终结果
+        if self.skip_partial_results:
+            if recognizer.AcceptWaveform(audio_data):
+                result = json.loads(recognizer.Result())
+                return result.get('text', '').strip()
+        else:
+            # 标准模式：处理部分和最终结果
+            if recognizer.AcceptWaveform(audio_data):
+                result = json.loads(recognizer.Result())
+                return result.get('text', '').strip()
+            else:
+                partial_result = json.loads(recognizer.PartialResult())
+                return partial_result.get('partial', '').strip()
+        
+        return None
+    
     def _save_recording(self, audio_data):
         """保存录音"""
         timestamp = time.strftime("%Y%m%d_%H%M%S")
@@ -273,7 +331,11 @@ class SimpleWakeAndRecord:
         # 为录音创建一个新的识别器
         if self.model:
             self.recording_recognizer = KaldiRecognizer(self.model, self.RATE)
-            self.recording_recognizer.SetWords(True)
+            # 激进模式：禁用词级识别以提高性能
+            if self.AGGRESSIVE_MODE:
+                self.recording_recognizer.SetWords(False)
+            else:
+                self.recording_recognizer.SetWords(True)
     
     def _stop_recording(self):
         """停止录音"""
@@ -311,13 +373,19 @@ class SimpleWakeAndRecord:
         try:
             while self.running:
                 # 读取音频数据
+                receive_time = time.time()
                 data = self.stream.read(self.CHUNK_SIZE, exception_on_overflow=False)
                 
                 if len(data) == 0:
                     continue
                 
+                # 记录音频接收时间
+                self.audio_receive_times.append(receive_time)
+                if len(self.audio_receive_times) > self.max_latency_samples:
+                    self.audio_receive_times.pop(0)
+                
                 if self.recording:
-                    # 录音模式 - 直接处理
+                    # 录音模式 - 激进优化处理
                     self.recorded_frames.append(data)
                     recording_duration = time.time() - self.recording_start_time
                     
@@ -328,33 +396,45 @@ class SimpleWakeAndRecord:
                     if len(self.audio_buffer) > self.buffer_size:
                         self.audio_buffer.pop(0)
                     
-                    # 批处理识别
-                    if self._should_process_audio() and self.recording_recognizer:
-                        combined_data = self._process_audio_batch()
-                        if combined_data and self.recording_recognizer.AcceptWaveform(combined_data):
-                            # 获取最终识别结果
-                            result = json.loads(self.recording_recognizer.Result())
-                            text = result.get('text', '').strip()
+                    # 激进模式：直接处理，不等待批处理
+                    if self.AGGRESSIVE_MODE and self.recording_recognizer:
+                        # 直接处理当前音频块
+                        text = self._lightweight_recognition(self.recording_recognizer, data)
+                        if text:
+                            # 计算延迟
+                            if self.audio_receive_times:
+                                latency = self._calculate_latency(self.audio_receive_times[0])
+                                self.audio_receive_times.pop(0)
                             
-                            if text:
-                                # 识别到文字，更新时间戳
-                                self.last_text_time = time.time()
-                                print(f"\n📝 识别: {text}")
-                        elif combined_data:
-                            # 获取部分识别结果
-                            partial_result = json.loads(self.recording_recognizer.PartialResult())
-                            partial_text = partial_result.get('partial', '').strip()
-                            
-                            if partial_text:
-                                # 更新时间戳（部分识别也算有声音）
-                                self.last_text_time = time.time()
-                                status = f"录音中... {recording_duration:.1f}s | {partial_text}"
-                                print(f"\r{status}", end='', flush=True)
+                            # 识别到文字，更新时间戳
+                            self.last_text_time = time.time()
+                            print(f"\n📝 识别: {text} (延迟: {latency:.2f}s)")
+                    else:
+                        # 标准批处理模式
+                        if self._should_process_audio() and self.recording_recognizer:
+                            combined_data = self._process_audio_batch()
+                            if combined_data:
+                                text = self._lightweight_recognition(self.recording_recognizer, combined_data)
+                                if text:
+                                    # 计算延迟
+                                    if self.process_start_times:
+                                        process_start = self.process_start_times[0]
+                                        self.process_start_times.pop(0)
+                                        if self.audio_receive_times:
+                                            audio_time = self.audio_receive_times[0]
+                                            self.audio_receive_times.pop(0)
+                                            latency = process_start - audio_time
+                                            self._calculate_latency(audio_time)
+                                    
+                                    self.last_text_time = time.time()
+                                    print(f"\n📝 识别: {text}")
                     
                     # 检查是否需要结束录音
                     current_time = time.time()
                     
-                    # 检查是否有文字识别超时
+                    # 激进模式：缩短超时时间
+                    timeout_duration = 2.0 if self.AGGRESSIVE_MODE else 5.0
+                    
                     if self.last_text_time is not None:
                         text_silence_duration = current_time - self.last_text_time
                         if text_silence_duration > self.text_silence_threshold and recording_duration >= self.min_recording_time:
@@ -362,8 +442,8 @@ class SimpleWakeAndRecord:
                             self._stop_recording()
                     else:
                         # 还没有识别到任何文字，检查是否超时
-                        if recording_duration > 5.0:  # 如果5秒还没识别到任何文字，也结束
-                            print(f"\n\n5秒没有识别到文字，结束录音")
+                        if recording_duration > timeout_duration:
+                            print(f"\n\n{timeout_duration}秒没有识别到文字，结束录音")
                             self._stop_recording()
                     
                     # 检查最大录音时间
@@ -377,41 +457,55 @@ class SimpleWakeAndRecord:
                         print(f"\r{status}", end='', flush=True)
                 
                 elif self.model and self.recognizer:
-                    # 唤醒词检测模式 - 使用批处理
-                    self.audio_buffer.append(data)
-                    
-                    # 限制缓冲区大小
-                    if len(self.audio_buffer) > self.buffer_size:
-                        self.audio_buffer.pop(0)
-                    
-                    # 批处理识别
-                    if self._should_process_audio():
-                        combined_data = self._process_audio_batch()
-                        if combined_data and self.recognizer.AcceptWaveform(combined_data):
-                            result = json.loads(self.recognizer.Result())
-                            text = result.get('text', '').strip()
+                    # 唤醒词检测模式 - 激进优化
+                    if self.AGGRESSIVE_MODE:
+                        # 直接处理，不使用缓冲区
+                        text = self._lightweight_recognition(self.recognizer, data)
+                        if text:
+                            print(f"识别: {text}")
                             
-                            if text:
-                                print(f"识别: {text}")
-                                
-                                # 检查唤醒词
-                                is_wake_word, detected_word = self._check_wake_word(text)
-                                if is_wake_word:
-                                    print(f"🎯 检测到唤醒词: {detected_word}")
-                                    self._start_recording()
-                        else:
-                            # 显示实时音频级别
-                            energy = self._calculate_energy(data)
-                            if energy > 50:  # 只显示有意义的音频级别
-                                partial_result = json.loads(self.recognizer.PartialResult())
-                                partial_text = partial_result.get('partial', '')
-                                if partial_text:
-                                    status = f"监听中... 能量: {energy:.0f} | {partial_text}"
-                                else:
-                                    status = f"监听中... 能量: {energy:.0f}"
-                                print(status, end='\r')
+                            # 检查唤醒词
+                            is_wake_word, detected_word = self._check_wake_word(text)
+                            if is_wake_word:
+                                print(f"🎯 检测到唤醒词: {detected_word}")
+                                self._start_recording()
+                        
+                        # 显示实时音频级别（仅在高能量时）
+                        energy = self._calculate_energy(data)
+                        if energy > 100:  # 提高阈值，减少显示频率
+                            status = f"监听中... 能量: {energy:.0f}"
+                            print(status, end='\r')
+                    else:
+                        # 标准批处理模式
+                        self.audio_buffer.append(data)
+                        
+                        # 限制缓冲区大小
+                        if len(self.audio_buffer) > self.buffer_size:
+                            self.audio_buffer.pop(0)
+                        
+                        # 批处理识别
+                        if self._should_process_audio():
+                            combined_data = self._process_audio_batch()
+                            if combined_data:
+                                text = self._lightweight_recognition(self.recognizer, combined_data)
+                                if text:
+                                    print(f"识别: {text}")
+                                    
+                                    # 检查唤醒词
+                                    is_wake_word, detected_word = self._check_wake_word(text)
+                                    if is_wake_word:
+                                        print(f"🎯 检测到唤醒词: {detected_word}")
+                                        self._start_recording()
+                        
+                        # 显示实时音频级别
+                        energy = self._calculate_energy(data)
+                        if energy > 50:
+                            status = f"监听中... 能量: {energy:.0f}"
+                            print(status, end='\r')
                 
-                time.sleep(0.05)  # 增加延迟，减少CPU使用
+                # 激进模式：更长的延迟以减少CPU使用
+                sleep_time = 0.1 if self.AGGRESSIVE_MODE else 0.05
+                time.sleep(sleep_time)
                 
         except KeyboardInterrupt:
             print("\n👋 退出")
@@ -466,12 +560,17 @@ def main():
     print("5. 录音文件自动保存")
     print("6. 录音完成后自动播放刚才录制的内容")
     print("7. 按 Ctrl+C 退出")
-    print("🚀 性能优化已启用:")
+    print("🚀 激进性能优化已启用:")
     print("   - 采样率: 8kHz (降低50%数据量)")
-    print("   - 批处理: 5个音频块/次")
-    print("   - 处理间隔: 0.5秒")
-    print("   - 缓冲区: 10个音频块")
-    print("   - 性能监控: 每5秒显示")
+    print("   - 块大小: 4096字节 (4倍于原始大小)")
+    print("   - 激进模式: 已启用 (直接处理，跳过部分结果)")
+    print("   - 批处理: 3个音频块/次")
+    print("   - 处理间隔: 0.2秒")
+    print("   - 缓冲区: 5个音频块")
+    print("   - 词级识别: 已禁用 (提高性能)")
+    print("   - 性能监控: 每3秒显示")
+    print("   - 延迟监控: 实时显示")
+    print("   - 预期延迟: <1秒 (原10秒)")
     print("=" * 50)
     
     # 开始运行