fix

2025-09-18 20:13:08 +08:00 · 2025-09-18 20:13:08 +08:00 · 313e555589
commit 313e555589
parent 20fbd07675
2 changed files with 482 additions and 0 deletions
--- a/test_audio.py
+++ b/test_audio.py
@ -0,0 +1,101 @@
 #!/usr/bin/env python3
 """
 简单的音频测试脚本，用于诊断树莓派上的音频问题
 """
 import pyaudio
 import time
 import os
 def test_audio():
    """List every audio device and report the default input/output devices.

    Diagnostic helper: prints the name, input/output channel counts and
    default sample rate of each device PyAudio reports, followed by the
    default input and output devices. A missing default device is reported
    instead of aborting, and the PyAudio instance is always terminated.
    """
    print("=== 音频设备测试 ===")
    pa = pyaudio.PyAudio()
    try:
        # List all devices
        print("\n可用的音频设备:")
        for i in range(pa.get_device_count()):
            info = pa.get_device_info_by_index(i)
            print(f"  设备 {i}: {info['name']}")
            print(f"    输入通道: {info['maxInputChannels']}")
            print(f"    输出通道: {info['maxOutputChannels']}")
            print(f"    默认采样率: {info['defaultSampleRate']}")
            print()
        # Default input device; PyAudio raises IOError (an OSError) if none exists
        try:
            default_input = pa.get_default_input_device_info()
            print(f"默认输入设备: {default_input['name']} (索引: {default_input['index']})")
        except OSError as e:
            print(f"未找到默认输入设备: {e}")
        # Default output device, handled the same way
        try:
            default_output = pa.get_default_output_device_info()
            print(f"默认输出设备: {default_output['name']} (索引: {default_output['index']})")
        except OSError as e:
            print(f"未找到默认输出设备: {e}")
    finally:
        # Release PortAudio even when enumeration fails part-way
        pa.terminate()
 def test_recording():
    """Record five seconds from the default input device, then play it back.

    Opens a mono 16-bit input stream, captures about five seconds of audio
    in CHUNK-sized reads, then replays the captured frames on the default
    output device. Any failure is printed rather than raised. Whichever
    stream is open at the time of a failure is closed before PyAudio is
    terminated.
    """
    print("\n=== 录音测试 ===")
    pa = pyaudio.PyAudio()
    stream = None  # the currently open stream, so `finally` can close it
    try:
        # Recording parameters
        FORMAT = pyaudio.paInt16
        CHANNELS = 1
        # 16 kHz is requested here, not the device default.
        # NOTE(review): some devices may only accept their own default rate
        # (e.g. 44100 Hz) - confirm on the target hardware.
        RATE = 16000
        CHUNK = 1024
        print(f"尝试打开音频流，采样率: {RATE}")
        # Open the input stream
        stream = pa.open(
            format=FORMAT,
            channels=CHANNELS,
            rate=RATE,
            input=True,
            frames_per_buffer=CHUNK
        )
        print("开始录音5秒...")
        frames = []
        # Record for five seconds
        for i in range(0, int(RATE / CHUNK * 5)):
            data = stream.read(CHUNK)
            frames.append(data)
            if i % 10 == 0:
                print(f"录音中... {i * CHUNK / RATE:.1f}秒")
        print("录音完成")
        # Stop and release the input stream
        stream.stop_stream()
        stream.close()
        stream = None
        # Play the recording back
        print("播放录音...")
        stream = pa.open(
            format=FORMAT,
            channels=CHANNELS,
            rate=RATE,
            output=True
        )
        for frame in frames:
            stream.write(frame)
        stream.stop_stream()
        stream.close()
        stream = None
        print("播放完成")
    except Exception as e:
        print(f"录音测试失败: {e}")
    finally:
        try:
            # Only non-None when an error interrupted recording or playback
            if stream is not None:
                stream.close()
        finally:
            pa.terminate()
 # Script entry point: list the audio devices first, then run the
 # record-and-playback test.
 if __name__ == "__main__":
    test_audio()
    test_recording()
--- a/voice_assistant_pi.py
+++ b/voice_assistant_pi.py
@ -0,0 +1,381 @@
 #!/usr/bin/env python3
 """
 Voice Assistant: Real-Time Voice Chat (修复版)
 修复了树莓派上的音频设备问题
 """
 import io
 import json
 import os
 import queue
 import re
 import subprocess
 import threading
 import time
 import wave
 import numpy as np
 import pyaudio
 import requests
 import soxr
 from pydub import AudioSegment
 from vosk import KaldiRecognizer, Model
 # ------------------- TIMING UTILITY -------------------
 class Timer:
    """Context manager that prints how long its ``with`` body took.

    On exit it prints ``[Timing] <label>: <n> ms`` unless ``disable()`` was
    called. Elapsed time is measured with ``time.perf_counter()`` so that
    system clock adjustments cannot distort the measurement. Exceptions
    raised inside the ``with`` body are not suppressed.
    """

    def __init__(self, label):
        self.label = label
        self.enabled = True
        self.start = None  # set on __enter__

    def __enter__(self):
        self.start = time.perf_counter()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        if self.enabled:
            elapsed_ms = (time.perf_counter() - self.start) * 1000
            print(f"[Timing] {self.label}: {elapsed_ms:.0f} ms")

    def disable(self):
        """Suppress the timing line for this instance."""
        self.enabled = False
 # ------------------- FUNCTIONS -------------------
 def get_input_device_index(preferred_name=None):
    """Return the PyAudio device index of the microphone to record from.

    Args:
        preferred_name: Case-insensitive substring of the wanted device
            name. ``None`` or an empty string means "no preference"; an
            empty string used to match every device and silently select
            the first input-capable one.

    Returns:
        The index of the first input-capable device whose name contains
        ``preferred_name``, otherwise the index of the default input device.

    Raises:
        OSError: Propagated from PyAudio when the default device is needed
            but none is available.
    """
    pa = pyaudio.PyAudio()
    try:
        wanted = preferred_name.lower() if preferred_name else ""
        if wanted:
            # Look for the first input device whose name contains the hint
            for i in range(pa.get_device_count()):
                info = pa.get_device_info_by_index(i)
                if info['maxInputChannels'] > 0 and wanted in info['name'].lower():
                    print(f"[Debug] Selected input device {i}: {info['name']}")
                    print(f"[Debug] Device sample rate: {info['defaultSampleRate']} Hz")
                    return i
        # No preference given, or nothing matched: use the default device
        default_input = pa.get_default_input_device_info()
        if wanted:
            print(f"[Warning] Preferred mic not found. Using default: {default_input['name']}")
        else:
            print(f"[Debug] Using default input device: {default_input['name']}")
        return default_input['index']
    finally:
        pa.terminate()
 def get_output_device_index(preferred_name=None):
    """Return the PyAudio device index of the device to play audio on.

    Args:
        preferred_name: Case-insensitive substring of the wanted device
            name. ``None`` or an empty string means "no preference"; an
            empty string used to match every device and silently select
            the first output-capable one.

    Returns:
        The index of the first output-capable device whose name contains
        ``preferred_name``, otherwise the index of the default output device.

    Raises:
        OSError: Propagated from PyAudio when the default device is needed
            but none is available.
    """
    pa = pyaudio.PyAudio()
    try:
        wanted = preferred_name.lower() if preferred_name else ""
        if wanted:
            # Look for the first output device whose name contains the hint
            for i in range(pa.get_device_count()):
                info = pa.get_device_info_by_index(i)
                if info['maxOutputChannels'] > 0 and wanted in info['name'].lower():
                    print(f"[Debug] Selected output device {i}: {info['name']}")
                    return i
        # No preference given, or nothing matched: use the default device
        default_output = pa.get_default_output_device_info()
        if wanted:
            print(f"[Warning] Preferred output device not found. Using default: {default_output['name']}")
        else:
            print(f"[Debug] Using default output device: {default_output['name']}")
        return default_output['index']
    finally:
        pa.terminate()
 def list_input_devices():
    """Print index, name, default sample rate and channel count of every
    device that reports at least one input channel."""
    audio = pyaudio.PyAudio()
    try:
        print("[Debug] Available input devices:")
        for idx in range(audio.get_device_count()):
            dev = audio.get_device_info_by_index(idx)
            # Skip output-only devices
            if not dev['maxInputChannels'] > 0:
                continue
            print(f"  {idx}: {dev['name']} ({int(dev['defaultSampleRate'])} Hz, {dev['maxInputChannels']}ch)")
    finally:
        audio.terminate()
 def resample_audio(data, orig_rate=44100, target_rate=16000):
    """Resample a buffer of int16 PCM samples to a different sample rate.

    Args:
        data: Bytes-like buffer, interpreted as a flat sequence of int16
            samples.
        orig_rate: Sample rate of ``data`` in Hz.
        target_rate: Sample rate of the returned audio in Hz.

    Returns:
        The resampled audio as int16 PCM ``bytes``. An empty buffer, or a
        buffer whose rate already equals ``target_rate``, is returned
        unchanged without invoking the resampler.
    """
    # Fast path: nothing to convert. This matters in the audio callback when
    # the device already runs at the target rate.
    if not data or orig_rate == target_rate:
        return bytes(data)
    # Convert byte string to numpy array
    audio_np = np.frombuffer(data, dtype=np.int16)
    # Resample using soxr
    resampled_np = soxr.resample(audio_np, orig_rate, target_rate)
    # Convert back to bytes
    return resampled_np.astype(np.int16, copy=False).tobytes()
 # ------------------- PATHS -------------------
 # User configuration file. expanduser() is a no-op here because the path
 # contains no "~", so the file is looked up relative to the current working
 # directory.
 # NOTE(review): "~/va_config.json" may have been intended - confirm.
 CONFIG_PATH = os.path.expanduser("va_config.json")
 # Directory containing this script, made absolute so paths derived from it
 # do not depend on the working directory.
 BASE_DIR = os.path.dirname(os.path.abspath(__file__))
 MODEL_PATH = os.path.join(BASE_DIR, 'vosk-model')
 CHAT_URL = 'https://open.bigmodel.cn/api/paas/v4/chat/completions'
 # SECURITY: an API token is committed in source. The ZHIPUAI_API_KEY
 # environment variable takes precedence when set; the literal is kept only
 # so existing deployments keep working and should be rotated and removed.
 AUTH_TOKEN = os.environ.get('ZHIPUAI_API_KEY') or '0c9cbaca9d2bbf864990f1e1decdf340.dXRMsZCHTUbPQ0rm'
 # ------------------- CONFIG FILE LOADING -------------------
 # Built-in settings; values from the user config file override these
 # key by key.
 DEFAULT_CONFIG = {
    # Exposed as VOLUME
    "volume": 8,
    # Substring of the preferred microphone name; None selects the default input
    "mic_name": None,
    # Substring of the preferred playback device name; None selects the default output
    "audio_output_device": None,
    # Chat model identifier, exposed as MODEL_NAME
    "model_name": "glm-4.5",
    # Piper voice file name, looked up under the "voices" directory
    "voice": "en_US-kathleen-low.onnx",
    # Exposed as ENABLE_AUDIO_PROCESSING
    "enable_audio_processing": False,
    # Number of most recent conversation messages sent with each request
    "history_length": 4,
    # Content of the initial "system" message
    "system_prompt": "You are a helpful assistant."
 }
 def load_config():
    """Load the assistant configuration.

    Reads the JSON file at CONFIG_PATH, if it exists, and overlays its keys
    on DEFAULT_CONFIG. If the file is missing, unreadable, not valid JSON,
    or not a JSON object, a warning is printed where applicable and the
    defaults are used.

    Returns:
        A new dict on every call, so callers can modify it without changing
        DEFAULT_CONFIG.
    """
    if os.path.isfile(CONFIG_PATH):
        try:
            # Explicit encoding so the result does not depend on the locale
            with open(CONFIG_PATH, 'r', encoding='utf-8') as f:
                user_config = json.load(f)
            return {**DEFAULT_CONFIG, **user_config}
        # OSError: unreadable file; ValueError: bad JSON or bad encoding;
        # TypeError: top-level JSON value is not an object
        except (OSError, ValueError, TypeError) as e:
            print(f"[Warning] Failed to load system config: {e}")
    print("[Debug] Using default config.")
    return dict(DEFAULT_CONFIG)
 # Module-level setup: runs at import time, loads the configuration and
 # queries the audio hardware.
 config = load_config()
 # Apply loaded config values
 VOLUME = config["volume"]
 MIC_NAME = config["mic_name"]
 AUDIO_OUTPUT_DEVICE = config["audio_output_device"]
 AUDIO_OUTPUT_DEVICE_INDEX = get_output_device_index(config["audio_output_device"])
 MODEL_NAME = config["model_name"]
 # Relative path: resolved against the current working directory
 VOICE_MODEL = os.path.join("voices", config["voice"])
 ENABLE_AUDIO_PROCESSING = config["enable_audio_processing"]
 HISTORY_LENGTH = config["history_length"]
 # Setup messages with system prompt
 messages = [{"role": "system", "content": config["system_prompt"]}]
 list_input_devices()
 DEVICE_INDEX = get_input_device_index(config["mic_name"])
 # Read the sample rate from the selected input device
 pa = pyaudio.PyAudio()
 device_info = pa.get_device_info_by_index(DEVICE_INDEX)
 INPUT_RATE = int(device_info['defaultSampleRate'])
 # NOTE(review): OUTPUT_RATE is also taken from the *input* device's default
 # rate, not from the output device - confirm this is intended.
 OUTPUT_RATE = int(device_info['defaultSampleRate'])
 pa.terminate()
 # Frames per audio buffer
 CHUNK = 1024
 # Mono capture
 CHANNELS = 1
 # While False, the audio callback discards incoming microphone data
 mic_enabled = True
 print(f"[Debug] Using sample rate: {INPUT_RATE} Hz")
 print(f"[Debug] Config loaded: model={MODEL_NAME}, voice={config['voice']}, vol={VOLUME}")
 # ------------------- CONVERSATION STATE -------------------
 # Hand-off queue: the audio callback puts 16 kHz PCM chunks here
 audio_queue = queue.Queue()
 # Audio callback
 def audio_callback(in_data, frame_count, time_info, status):
    """PyAudio stream callback: queue captured audio for recognition.

    While ``mic_enabled`` is true, the captured buffer is resampled from
    INPUT_RATE to 16000 Hz and put on ``audio_queue``; otherwise it is
    dropped. Always returns ``(None, pyaudio.paContinue)``.
    """
    global mic_enabled
    if mic_enabled:
        audio_queue.put(
            resample_audio(in_data, orig_rate=INPUT_RATE, target_rate=16000)
        )
    return (None, pyaudio.paContinue)
 # ------------------- STREAM SETUP -------------------
 def start_stream():
    """Open and start the microphone input stream.

    The stream is opened on DEVICE_INDEX at INPUT_RATE (the device's default
    sample rate) and delivers audio through ``audio_callback``.

    Returns:
        ``(pa, stream)``: the PyAudio instance and the started stream. The
        caller must stop and close the stream and terminate ``pa``.

    Raises:
        Exception: Whatever PyAudio raises when the stream cannot be opened
            or started. The PyAudio instance is terminated before the
            exception propagates.
    """
    pa = pyaudio.PyAudio()
    try:
        stream = pa.open(
            rate=INPUT_RATE,  # use the device's default sample rate
            format=pyaudio.paInt16,
            channels=CHANNELS,
            input=True,
            input_device_index=DEVICE_INDEX,
            frames_per_buffer=CHUNK,
            stream_callback=audio_callback
        )
        stream.start_stream()
    except Exception:
        # Do not leak the PortAudio handle when the device cannot be opened
        pa.terminate()
        raise
    print(f'[Debug] Stream @ {INPUT_RATE}Hz')
    return pa, stream
 # ------------------- QUERY GLM API -------------------
 def query_glm():
    """Send the recent conversation to the GLM chat API and return the reply.

    The request contains the system prompt (``messages[0]``) exactly once,
    followed by the last HISTORY_LENGTH non-system messages. At least the
    latest message is always included. The model is taken from MODEL_NAME.

    Returns:
        The assistant's reply text with surrounding whitespace removed, or
        ``''`` when the request fails, times out, or the response carries
        no usable content.
    """
    headers = {
        'Authorization': 'Bearer ' + AUTH_TOKEN,
        'Content-Type': 'application/json'
    }
    # Slice the history *without* the system prompt. Slicing ``messages``
    # directly duplicated the system prompt whenever the conversation was
    # no longer than HISTORY_LENGTH.
    recent = messages[1:][-max(int(HISTORY_LENGTH), 1):]
    payload = {
        "model": MODEL_NAME,
        "messages": [messages[0]] + recent,
        "temperature": 0.6,
        "max_tokens": 1024,
        "stream": False
    }
    with Timer("Inference"):
        try:
            # (connect, read) timeouts so a stalled connection cannot block
            # the processing thread forever
            resp = requests.post(CHAT_URL, json=payload, headers=headers, timeout=(10, 120))
            resp.raise_for_status()
            data = resp.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers a response body that is not valid JSON
            print(f"[Error] GLM API request failed: {e}")
            return ''
    choices = data.get('choices') if isinstance(data, dict) else None
    if not choices:
        return ''
    message = choices[0].get('message') or {}
    content = message.get('content')
    # Content may be missing or null; only text is usable
    return content.strip() if isinstance(content, str) else ''
 # ------------------- TTS & DEGRADATION -------------------
 def _clean_for_tts(text):
    """Strip markup from text that should not be read aloud by the TTS engine."""
    clean = re.sub(r"[\*]+", '', text)          # asterisk emphasis markers
    clean = re.sub(r"\(.*?\)", '', clean)       # parenthesised asides
    clean = re.sub(r"<.*?>", '', clean)         # angle-bracket tags
    clean = clean.replace('\n', ' ').strip()
    clean = re.sub(r'\s+', ' ', clean)          # collapse whitespace runs
    # Emoji and symbol/dingbat ranges
    clean = re.sub(r'[\U0001F300-\U0001FAFF\u2600-\u26FF\u2700-\u27BF]+', '', clean)
    return clean
 def play_response(text):
    """Speak ``text`` with Piper TTS while the microphone is muted.

    Sets ``mic_enabled`` to False for the duration of synthesis and
    playback, cleans the text, runs the Piper executable to produce raw
    16-bit mono PCM and plays it on AUDIO_OUTPUT_DEVICE_INDEX. Errors are
    printed, not raised. The microphone is always re-enabled, and the
    output stream and PyAudio instance are released even when playback
    fails.

    Args:
        text: The assistant reply to speak.
    """
    global mic_enabled
    mic_enabled = False
    clean = _clean_for_tts(text)
    piper_path = os.path.join(BASE_DIR, 'bin', 'piper', 'piper')
    if not os.path.exists(piper_path):
        print(f"[Error] Piper executable not found at {piper_path}")
        mic_enabled = True
        return
    try:
        # Generate Piper raw PCM
        with Timer("Piper inference"):
            piper_proc = subprocess.Popen(
                [piper_path, '--model', VOICE_MODEL, '--output_raw'],
                stdin=subprocess.PIPE,
                stdout=subprocess.PIPE,
                stderr=subprocess.DEVNULL
            )
            tts_pcm, _ = piper_proc.communicate(input=clean.encode())
        if not tts_pcm:
            print(f"[Error] Piper produced no audio (exit code {piper_proc.returncode})")
            return
        # Playback: write the raw PCM straight to the output stream in
        # CHUNK-frame pieces (2 bytes per mono 16-bit frame).
        # NOTE(review): the rate is fixed at 16000 Hz; presumably this matches
        # the configured voice - verify against the voice model's sample rate.
        bytes_per_chunk = CHUNK * 2
        with Timer("Playback"):
            pa = pyaudio.PyAudio()
            try:
                stream = pa.open(
                    format=pyaudio.paInt16,
                    channels=1,
                    rate=16000,
                    output=True,
                    output_device_index=AUDIO_OUTPUT_DEVICE_INDEX
                )
                try:
                    for offset in range(0, len(tts_pcm), bytes_per_chunk):
                        stream.write(tts_pcm[offset:offset + bytes_per_chunk])
                    stream.stop_stream()
                finally:
                    stream.close()
            finally:
                pa.terminate()
    except Exception as e:
        print(f"[Error] TTS playback failed: {e}")
    finally:
        mic_enabled = True
        time.sleep(0.3)
 # ------------------- PROCESSING LOOP -------------------
 def processing_loop():
    """Run the recognise -> chat -> speak loop forever.

    Loads the Vosk model from MODEL_PATH (returning early with an error
    message if that fails), then repeatedly takes 16 kHz PCM chunks from
    ``audio_queue`` and feeds them to the recogniser. Each completed,
    non-empty utterance that is not a filler word is appended to
    ``messages``, sent via ``query_glm()`` and the reply is spoken with
    ``play_response()``. Any exception inside one iteration is printed and
    followed by a one-second pause.
    """
    try:
        model = Model(MODEL_PATH)
        print("[Debug] Vosk model loaded successfully")
    except Exception as e:
        print(f"[Error] Failed to load Vosk model: {e}")
        print(f"[Info] Model path: {MODEL_PATH}")
        return
    MAX_DEBUG_LEN = 200
    LOW_EFFORT_UTTERANCES = {"huh", "uh", "um", "erm", "hmm", "he's", "but"}
    newline_to_space = str.maketrans('\n\r', '  ')
    rec = KaldiRecognizer(model, 16000)
    while True:
        try:
            chunk = audio_queue.get()
            # Keep feeding audio until the recogniser reports a final result
            if not rec.AcceptWaveform(chunk):
                continue
            t0 = time.time()
            result = json.loads(rec.Result())
            stt_ms = int((time.time() - t0) * 1000)
            utterance = result.get('text', '').strip()
            if not utterance:
                continue
            print(f"[Timing] STT parse: {stt_ms} ms")
            print("User:", utterance)
            if utterance.lower().strip(".,!? ") in LOW_EFFORT_UTTERANCES:
                print("[Debug] Ignored low-effort utterance.")
                rec = KaldiRecognizer(model, 16000)
                continue
            messages.append({"role": "user", "content": utterance})
            reply = query_glm()
            if not reply:
                print('[Debug] Empty response, skipping TTS.')
            else:
                # Single-line preview, capped at MAX_DEBUG_LEN characters
                preview = reply.translate(newline_to_space)
                if len(preview) > MAX_DEBUG_LEN:
                    preview = preview[:MAX_DEBUG_LEN] + '...'
                print('Assistant:', preview)
                # The history stores the preview, not the full reply
                messages.append({"role": "assistant", "content": preview})
                play_response(reply)
            # Fresh recogniser for the next utterance
            rec = KaldiRecognizer(model, 16000)
        except Exception as e:
            print(f"[Error] Processing loop error: {e}")
            time.sleep(1)
 # ------------------- MAIN -------------------
 # Entry point: start microphone capture, run the processing loop on a
 # daemon thread, and idle until the stream stops or the user presses Ctrl+C.
 if __name__ == '__main__':
    # Pre-bind both names so cleanup is safe even if start_stream() fails
    # (the except handlers previously raised NameError in that case).
    pa = None
    stream = None
    try:
        pa, stream = start_stream()
        t = threading.Thread(target=processing_loop, daemon=True)
        t.start()
        print("[Debug] Voice assistant started. Press Ctrl+C to exit.")
        while stream.is_active():
            time.sleep(0.1)
    except KeyboardInterrupt:
        print("[Debug] Shutting down...")
    except Exception as e:
        print(f"[Error] Main loop error: {e}")
    finally:
        # Release audio resources on every exit path, including a normal
        # loop exit when the stream becomes inactive
        if stream is not None:
            stream.stop_stream()
            stream.close()
        if pa is not None:
            pa.terminate()