# Local-Voice/simple_wake_and_record.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
简化的唤醒+录音测试
专注于解决音频冲突问题
"""

import sys
import os
import time
import threading
import pyaudio
import json

# 添加当前目录到路径
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

try:
    from vosk import Model, KaldiRecognizer
    VOSK_AVAILABLE = True
except ImportError:
    VOSK_AVAILABLE = False
    print("⚠️  Vosk 未安装，请运行: pip install vosk")

class SimpleWakeAndRecord:
    """简化的唤醒+录音系统"""

    def __init__(self, model_path="model", wake_words=("你好", "助手")):
        """Initialise all state, then load the Vosk model and open the microphone.

        Args:
            model_path: Directory that contains the Vosk model.
            wake_words: Iterable of phrases; recognised text containing any of
                them starts a recording. The iterable is copied, so the
                instance never shares a list with the caller or with the
                function's default value.
        """
        self.model_path = model_path
        # Copy into a private list: a shared (mutable default) list would let
        # one instance's changes leak into every other instance.
        self.wake_words = list(wake_words) if wake_words else []
        self.model = None
        self.recognizer = None
        self.audio = None
        self.stream = None
        self.running = False

        # Audio parameters - tuned aggressively for a Raspberry Pi 3B
        self.FORMAT = pyaudio.paInt16
        self.CHANNELS = 1
        self.RATE = 8000  # 8 kHz instead of 16 kHz: half the data to process
        self.CHUNK_SIZE = 4096  # frames per read; larger chunks mean fewer loop iterations
        self.AGGRESSIVE_MODE = True  # feed every chunk straight to the recognizer

        # Recording state
        self.recording = False
        self.recorded_frames = []
        self.last_text_time = None  # time of the most recent recognised text
        self.recording_start_time = None
        self.recording_recognizer = None  # dedicated recognizer used while recording

        # Batching parameters (used by the non-aggressive code path)
        self.audio_buffer = []  # pending audio chunks
        self.buffer_size = 5  # maximum number of chunks kept in audio_buffer
        self.last_process_time = time.time()  # time of the last batch
        self.process_interval = 0.2  # minimum seconds between batches
        self.batch_process_size = 3  # chunks consumed per batch
        self.skip_partial_results = True  # ignore partial results, use final ones only

        # Performance monitoring
        self.process_count = 0
        self.avg_process_time = 0
        self.last_monitor_time = time.time()
        self.monitor_interval = 3.0  # seconds between monitor printouts

        # Latency monitoring
        self.audio_receive_times = []  # timestamps taken just before each read
        self.process_start_times = []  # timestamps of batch starts
        self.latency_samples = []  # recent latency measurements
        self.max_latency_samples = 10  # cap for the sample lists above

        # Thresholds (seconds)
        self.text_silence_threshold = 3.0  # stop after this long without new text
        self.min_recording_time = 2.0  # never stop on silence before this
        self.max_recording_time = 30.0  # hard upper bound for one recording

        self._setup_model()
        self._setup_audio()

    def _setup_model(self):
        """Load the Vosk model and create the wake-word recognizer.

        Returns silently when Vosk could not be imported, and after printing a
        message when ``self.model_path`` does not exist. Any exception raised
        while loading is caught and printed rather than propagated.
        """
        if not VOSK_AVAILABLE:
            return

        try:
            if not os.path.exists(self.model_path):
                print(f"模型路径不存在: {self.model_path}")
                return

            print("🔄 正在加载模型，这可能需要一些时间...")
            started_at = time.time()

            self.model = Model(self.model_path)
            self.recognizer = KaldiRecognizer(self.model, self.RATE)

            # Word-level output is switched off in aggressive mode.
            self.recognizer.SetWords(not self.AGGRESSIVE_MODE)
            if self.AGGRESSIVE_MODE:
                print("📉 激进模式：已禁用词级识别以提高性能")

            elapsed = time.time() - started_at
            print(f"✅ Vosk 模型加载成功 (耗时: {elapsed:.2f}s)")

        except Exception as exc:
            print(f"模型初始化失败: {exc}")

    def _setup_audio(self):
        """Create the PyAudio instance and open the input stream if missing.

        Existing ``self.audio`` / ``self.stream`` objects are reused. Failures
        are caught and printed, leaving whatever was not created as ``None``.
        """
        try:
            if self.audio is None:
                self.audio = pyaudio.PyAudio()

            if self.stream is None:
                stream_options = {
                    "format": self.FORMAT,
                    "channels": self.CHANNELS,
                    "rate": self.RATE,
                    "input": True,
                    "frames_per_buffer": self.CHUNK_SIZE,
                }
                self.stream = self.audio.open(**stream_options)

            print("✅ 音频设备初始化成功")

        except Exception as exc:
            print(f"音频设备初始化失败: {exc}")

    def _calculate_energy(self, audio_data):
        """Return the RMS amplitude of a buffer of native-endian int16 samples.

        Args:
            audio_data: Bytes-like object holding 16-bit PCM samples. A
                trailing odd byte (an incomplete sample) is ignored.

        Returns:
            The root-mean-square sample value as a float, or 0 when the buffer
            does not contain a single complete sample.
        """
        sample_count = len(audio_data) // 2
        if sample_count == 0:
            return 0

        import numpy as np

        samples = np.frombuffer(audio_data, dtype=np.int16, count=sample_count)
        # Widen before squaring: int16 ** 2 stays int16 and wraps around for
        # any |sample| > 181, which made the result meaningless for real audio.
        squared = samples.astype(np.float64) ** 2
        return float(np.sqrt(np.mean(squared)))

    def _check_wake_word(self, text):
        """Look for any configured wake word inside ``text`` (case-insensitive).

        Returns:
            ``(True, wake_word)`` for the first wake word (in configuration
            order) that occurs as a substring of ``text``; ``(False, None)``
            when ``text`` or the wake-word list is empty or nothing matches.
        """
        if not (text and self.wake_words):
            return False, None

        haystack = text.lower()
        hit = next(
            (candidate for candidate in self.wake_words
             if candidate.lower() in haystack),
            None,
        )
        if hit is None:
            return False, None
        return True, hit

    def _should_process_audio(self):
        """Tell whether a batch is due.

        A batch is due once at least ``process_interval`` seconds have passed
        since ``last_process_time`` and the buffer holds at least
        ``batch_process_size`` chunks.
        """
        elapsed = time.time() - self.last_process_time
        if elapsed < self.process_interval:
            return False
        return len(self.audio_buffer) >= self.batch_process_size

    def _process_audio_batch(self):
        """Remove one batch of chunks from the buffer and return it joined.

        Side effects: records the batch start time, updates
        ``last_process_time`` and the running statistics, and triggers the
        periodic performance printout.

        Returns:
            The first ``batch_process_size`` buffered chunks concatenated into
            one bytes object, or ``None`` when the buffer holds fewer chunks
            than that.
        """
        if len(self.audio_buffer) < self.batch_process_size:
            return None

        # Remember when this batch started. The list is capped like the other
        # sample lists; entries are only consumed when text is recognised, so
        # without a cap it would grow for as long as the program runs.
        start_time = time.time()
        self.process_start_times.append(start_time)
        if len(self.process_start_times) > self.max_latency_samples:
            self.process_start_times.pop(0)

        # Split off the batch and keep the remainder buffered.
        batch_data = self.audio_buffer[:self.batch_process_size]
        self.audio_buffer = self.audio_buffer[self.batch_process_size:]
        combined_data = b''.join(batch_data)

        self.last_process_time = time.time()

        # Running mean of the time spent in this method (slicing and joining
        # only; recognition happens in the caller and is not included).
        process_time = time.time() - start_time
        self.process_count += 1
        self.avg_process_time = (
            self.avg_process_time * (self.process_count - 1) + process_time
        ) / self.process_count

        self._monitor_performance()

        return combined_data

    def _monitor_performance(self):
        """Print a statistics line at most once per ``monitor_interval`` seconds.

        The line reports the batch count, mean batch time, buffer fill level
        (percent of ``buffer_size``) and the mean of the stored latency
        samples (0 when there are none).
        """
        now = time.time()
        if now - self.last_monitor_time < self.monitor_interval:
            return

        fill_percent = len(self.audio_buffer) / self.buffer_size * 100
        samples = self.latency_samples
        mean_latency = sum(samples) / len(samples) if samples else 0

        print(f"\n📊 性能监控 | 处理次数: {self.process_count} | 平均处理时间: {self.avg_process_time:.3f}s | 缓冲区使用: {fill_percent:.1f}% | 平均延迟: {mean_latency:.2f}s")
        self.last_monitor_time = now

    def _calculate_latency(self, audio_time):
        """Record and return the seconds elapsed since ``audio_time``.

        The value is appended to ``latency_samples``; when that pushes the
        list past ``max_latency_samples`` the oldest entry is dropped.
        """
        elapsed = time.time() - audio_time

        history = self.latency_samples
        history.append(elapsed)
        if len(history) > self.max_latency_samples:
            del history[0]

        return elapsed

    def _lightweight_recognition(self, recognizer, audio_data):
        """Feed ``audio_data`` to ``recognizer`` and return any recognised text.

        Returns:
            The stripped final text when the recognizer accepts the waveform
            as a complete result. Otherwise the stripped partial text when
            ``skip_partial_results`` is false, or ``None`` when it is true.
            ``None`` is also returned when no recognizer is given.
        """
        if not recognizer:
            return None

        if recognizer.AcceptWaveform(audio_data):
            final = json.loads(recognizer.Result())
            return final.get('text', '').strip()

        # No final result yet: partials are only reported in standard mode.
        if self.skip_partial_results:
            return None

        partial = json.loads(recognizer.PartialResult())
        return partial.get('partial', '').strip()

    def _save_recording(self, audio_data):
        """Write ``audio_data`` to a timestamped WAV file in the working directory.

        The file is named ``recording_<YYYYmmdd_HHMMSS>.wav`` and uses the
        instance's channel count, sample format and rate.

        Returns:
            ``(True, filename)`` on success, ``(False, None)`` when writing
            fails (the error is printed).
        """
        stamp = time.strftime("%Y%m%d_%H%M%S")
        filename = "recording_" + stamp + ".wav"

        try:
            import wave
            with wave.open(filename, 'wb') as wav_out:
                wav_out.setnchannels(self.CHANNELS)
                sample_width = self.audio.get_sample_size(self.FORMAT)
                wav_out.setsampwidth(sample_width)
                wav_out.setframerate(self.RATE)
                wav_out.writeframes(audio_data)
        except Exception as exc:
            print(f"保存录音失败: {exc}")
            return False, None

        print(f"✅ 录音已保存: {filename}")
        return True, filename

    def _play_audio(self, filename):
        """Play a WAV file through PyAudio, falling back to a system player.

        The file is streamed to the output device in chunks of 1024 frames, so
        the whole recording is never held in memory. The output stream is
        always stopped and closed, even when playback fails part-way.

        Args:
            filename: Path of the WAV file to play.

        Any exception is caught and printed, after which
        ``_play_with_system_player`` is tried with the same file.
        """
        try:
            import wave

            frames_per_write = 1024

            with wave.open(filename, 'rb') as wf:
                # Open an output stream that matches the file's own format.
                playback_stream = self.audio.open(
                    format=self.audio.get_format_from_width(wf.getsampwidth()),
                    channels=wf.getnchannels(),
                    rate=wf.getframerate(),
                    output=True
                )
                try:
                    print(f"🔊 开始播放: {filename}")

                    chunk = wf.readframes(frames_per_write)
                    while chunk:
                        playback_stream.write(chunk)
                        chunk = wf.readframes(frames_per_write)
                finally:
                    # Release the device no matter how playback ended;
                    # otherwise a failed write leaks the open stream.
                    try:
                        playback_stream.stop_stream()
                    finally:
                        playback_stream.close()

            print("✅ 播放完成")

        except Exception as e:
            print(f"❌ 播放失败: {e}")
            # PyAudio playback failed; try an external player instead.
            self._play_with_system_player(filename)

    def _play_with_system_player(self, filename):
        """Play ``filename`` with an external, platform-specific command.

        Uses ``afplay`` on macOS, ``cmd /c start`` on Windows and ``aplay``
        everywhere else. Failures are caught and printed together with a hint
        to play the file manually.

        Args:
            filename: Path of the audio file to play.
        """
        try:
            import platform
            import subprocess

            system = platform.system()

            if system == 'Darwin':  # macOS
                cmd = ['afplay', filename]
            elif system == 'Windows':
                # `start` is a cmd.exe builtin, not an executable, so it has
                # to be run through `cmd /c`. The empty string is the window
                # title argument; without it a quoted filename would be taken
                # as the title.
                cmd = ['cmd', '/c', 'start', '/min', '', filename]
            else:  # Linux and other platforms
                cmd = ['aplay', filename]

            print(f"🔊 使用系统播放器: {' '.join(cmd)}")
            subprocess.run(cmd, check=True)
            print("✅ 播放完成")

        except Exception as e:
            print(f"❌ 系统播放器也失败: {e}")
            print(f"💡 文件已保存，请手动播放: {filename}")

    def _start_recording(self):
        """Switch to recording mode with fresh recording state.

        Clears the frame list and text timestamp, stamps the start time and,
        when a model is loaded, creates a dedicated recognizer for the
        recording (word-level output off in aggressive mode).
        """
        print("🎙️ 开始录音，请说话...")
        self.recorded_frames = []
        self.last_text_time = None
        self.recording = True
        self.recording_start_time = time.time()

        if not self.model:
            return

        # A separate recognizer keeps the recording transcript independent of
        # the wake-word recognizer's state.
        self.recording_recognizer = KaldiRecognizer(self.model, self.RATE)
        self.recording_recognizer.SetWords(not self.AGGRESSIVE_MODE)

    def _stop_recording(self):
        """Finish the current recording: save it, play it back, reset state.

        When frames were captured they are joined, the duration is printed
        (computed as bytes / (RATE * 2)), the audio is saved and, if saving
        succeeded, played back. The recording state is reset in every case.
        """
        if self.recorded_frames:
            pcm = b''.join(self.recorded_frames)
            seconds = len(pcm) / (self.RATE * 2)
            print(f"📝 录音完成，时长: {seconds:.2f}秒")

            saved, path = self._save_recording(pcm)

            # Play the recording back only when it was written successfully.
            if saved and path:
                divider = "=" * 50
                print(divider)
                print("🔊 播放刚才录制的音频...")
                self._play_audio(path)
                print(divider)

        self.recording = False
        self.recorded_frames = []
        self.recording_recognizer = None
        self.recording_start_time = None
        self.last_text_time = None

    def start(self):
        """Run the blocking capture loop until stopped or interrupted.

        Each iteration reads one chunk from the input stream and hands it to
        the recording handler (while recording) or to the wake-word handler
        (when a model and recognizer are loaded). Ctrl+C and any other
        exception end the loop; ``stop()`` is always called on the way out.

        Returns immediately, after printing a message, when no input stream
        is open.
        """
        if not self.stream:
            print("❌ 音频设备未初始化")
            return

        self.running = True
        print("🎤 开始监听...")
        print(f"唤醒词: {', '.join(self.wake_words)}")

        try:
            while self.running:
                receive_time = time.time()
                data = self.stream.read(self.CHUNK_SIZE, exception_on_overflow=False)

                if len(data) == 0:
                    continue

                # Keep a bounded history of receive timestamps for latency stats.
                self.audio_receive_times.append(receive_time)
                if len(self.audio_receive_times) > self.max_latency_samples:
                    self.audio_receive_times.pop(0)

                if self.recording:
                    self._handle_recording_chunk(data)
                elif self.model and self.recognizer:
                    self._handle_listening_chunk(data)

                # Short pause between reads; longer in aggressive mode.
                sleep_time = 0.1 if self.AGGRESSIVE_MODE else 0.05
                time.sleep(sleep_time)

        except KeyboardInterrupt:
            print("\n👋 退出")
        except Exception as e:
            print(f"错误: {e}")
        finally:
            self.stop()

    def _buffer_chunk(self, data):
        """Append ``data`` to the batch buffer, dropping the oldest chunk when full."""
        self.audio_buffer.append(data)
        if len(self.audio_buffer) > self.buffer_size:
            self.audio_buffer.pop(0)

    def _handle_recording_chunk(self, data):
        """Store one chunk of an active recording and stop it when it is due.

        The recording ends, exactly once, on the first matching condition:
        silence after recognised text, no text at all within the initial
        timeout, or the maximum recording time.
        """
        self.recorded_frames.append(data)
        recording_duration = time.time() - self.recording_start_time
        self._buffer_chunk(data)

        self._recognize_while_recording(data)

        current_time = time.time()
        # Initial no-speech timeout; shorter in aggressive mode.
        timeout_duration = 2.0 if self.AGGRESSIVE_MODE else 5.0

        stop_message = None
        if self.last_text_time is not None:
            text_silence_duration = current_time - self.last_text_time
            if (text_silence_duration > self.text_silence_threshold
                    and recording_duration >= self.min_recording_time):
                stop_message = f"\n\n{self.text_silence_threshold:g}秒没有识别到文字，结束录音"
        elif recording_duration > timeout_duration:
            stop_message = f"\n\n{timeout_duration}秒没有识别到文字，结束录音"

        if stop_message is None and recording_duration > self.max_recording_time:
            stop_message = f"\n\n达到最大录音时间 {self.max_recording_time}s"

        if stop_message is not None:
            print(stop_message)
            self._stop_recording()
            # Return here so the recording is not stopped a second time and no
            # "waiting" status is printed after it has already ended.
            return

        if self.last_text_time is None:
            status = f"等待语音输入... {recording_duration:.1f}s"
            print(f"\r{status}", end='', flush=True)

    def _recognize_while_recording(self, data):
        """Run recognition for a recording chunk and note when text was heard."""
        recognizer = self.recording_recognizer

        if self.AGGRESSIVE_MODE and recognizer:
            # Aggressive mode: recognise every chunk directly, no batching.
            text = self._lightweight_recognition(recognizer, data)
            if not text:
                return
            latency = 0.0  # defined even if no receive timestamp is available
            if self.audio_receive_times:
                latency = self._calculate_latency(self.audio_receive_times.pop(0))
            self.last_text_time = time.time()
            print(f"\n📝 识别: {text} (延迟: {latency:.2f}s)")
            return

        # Standard mode: recognise in batches once enough chunks are buffered.
        if not (recognizer and self._should_process_audio()):
            return
        combined_data = self._process_audio_batch()
        if not combined_data:
            return
        text = self._lightweight_recognition(recognizer, combined_data)
        if not text:
            return
        if self.process_start_times:
            self.process_start_times.pop(0)
            if self.audio_receive_times:
                self._calculate_latency(self.audio_receive_times.pop(0))
        self.last_text_time = time.time()
        print(f"\n📝 识别: {text}")

    def _handle_listening_chunk(self, data):
        """Check one chunk for a wake word and print the input level when loud."""
        if self.AGGRESSIVE_MODE:
            # Recognise the chunk directly, bypassing the batch buffer.
            text = self._lightweight_recognition(self.recognizer, data)
            energy_threshold = 100  # higher threshold: fewer status updates
        else:
            self._buffer_chunk(data)
            text = None
            if self._should_process_audio():
                combined_data = self._process_audio_batch()
                if combined_data:
                    text = self._lightweight_recognition(self.recognizer, combined_data)
            energy_threshold = 50

        if text:
            print(f"识别: {text}")
            is_wake_word, detected_word = self._check_wake_word(text)
            if is_wake_word:
                print(f"🎯 检测到唤醒词: {detected_word}")
                self._start_recording()

        energy = self._calculate_energy(data)
        if energy > energy_threshold:
            status = f"监听中... 能量: {energy:.0f}"
            print(status, end='\r')

    def stop(self):
        """Stop the loop, finish any active recording and release audio resources.

        Cleanup is exception-safe: the stream and PyAudio references are
        cleared first, the stream is closed even if stopping it fails, and
        PyAudio is terminated even if the stream cleanup fails. An exception
        raised during cleanup still propagates after the remaining steps ran.
        Calling ``stop()`` again afterwards is a no-op.
        """
        self.running = False
        try:
            if self.recording:
                self._stop_recording()
        finally:
            # Detach the handles up front so a failure below cannot leave
            # stale, half-closed objects on the instance.
            stream, self.stream = self.stream, None
            audio, self.audio = self.audio, None
            try:
                if stream:
                    try:
                        stream.stop_stream()
                    finally:
                        stream.close()
            finally:
                if audio:
                    audio.terminate()

def main():
    """Entry point: check for the model directory, build the system and run it.

    Prints a hint and returns when the ``model`` directory is missing, and
    returns after an error message when the model could not be loaded.
    Otherwise prints the usage banner and enters the blocking capture loop.
    """
    divider = "=" * 50
    print("🚀 简化唤醒+录音测试")
    print(divider)

    # The Vosk model is expected in ./model relative to the working directory.
    model_dir = "model"
    if not os.path.exists(model_dir):
        print("⚠️  未找到模型目录")
        print("请下载 Vosk 模型到 model 目录")
        return

    system = SimpleWakeAndRecord(
        model_path=model_dir,
        wake_words=["你好", "助手", "小爱"]
    )

    if not system.model:
        print("❌ 模型加载失败")
        return

    banner_lines = (
        "✅ 系统初始化成功",
        "📖 使用说明:",
        "1. 说唤醒词开始录音",
        "2. 基于语音识别判断，3秒没有识别到文字就结束",
        "3. 最少录音2秒，最多30秒",
        "4. 录音时实时显示识别结果",
        "5. 录音文件自动保存",
        "6. 录音完成后自动播放刚才录制的内容",
        "7. 按 Ctrl+C 退出",
        "🚀 激进性能优化已启用:",
        "   - 采样率: 8kHz (降低50%数据量)",
        "   - 块大小: 4096字节 (4倍于原始大小)",
        "   - 激进模式: 已启用 (直接处理，跳过部分结果)",
        "   - 批处理: 3个音频块/次",
        "   - 处理间隔: 0.2秒",
        "   - 缓冲区: 5个音频块",
        "   - 词级识别: 已禁用 (提高性能)",
        "   - 性能监控: 每3秒显示",
        "   - 延迟监控: 实时显示",
        "   - 预期延迟: <1秒 (原10秒)",
    )
    for line in banner_lines:
        print(line)
    print(divider)

    # Blocks until the user interrupts or an error ends the loop.
    system.start()

# Run the demo only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()