diff --git a/.DS_Store b/.DS_Store index 973cf92..2f240b4 100644 Binary files a/.DS_Store and b/.DS_Store differ diff --git a/LICENSE b/LICENSE deleted file mode 100644 index 0403e55..0000000 --- a/LICENSE +++ /dev/null @@ -1,21 +0,0 @@ -MIT License - -Copyright (c) 2025 m15-ai - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. diff --git a/README.md b/README.md deleted file mode 100644 index 29f0c99..0000000 --- a/README.md +++ /dev/null @@ -1,333 +0,0 @@ -# Local Voice Assistant (Offline, Real-Time AI) - -**Lightweight, low-latency voice assistant running fully offline on a Raspberry Pi or Linux machine.** -Powered by PyAudio, Vosk STT, Piper TTS, and local LLMs via Ollama. - -![badge](https://img.shields.io/badge/Offline-Voice%20AI-blue) -![badge](https://img.shields.io/badge/Audio-PyAudio-yellow) -![badge](https://img.shields.io/badge/TTS-Piper-orange) -![badge](https://img.shields.io/badge/LLM-Gemma2%20%7C%20Qwen-success) - ---- - -## 🎯 Features - -- 🎙️ **Microphone Input** using PyAudio -- 🔊 **Real-Time Transcription** with [Vosk](https://alphacephei.com/vosk/) -- 🧠 **LLM-Powered Responses** using [Ollama](https://ollama.com) with models like `gemma2:2b`, `qwen2.5:0.5b` -- 🗣️ **Natural Voice Output** via [Piper TTS](https://github.com/rhasspy/piper) -- 🎛️ Optional **Noise & Filter FX** using SoX for realism -- 🔧 ALSA **Volume Control** -- 🧩 Modular Python code ready for customization - ---- - -## 🛠 Requirements - -- Raspberry Pi 5 or Linux desktop -- Python 3.9+ -- PyAudio, NumPy, requests, soxr, pydub, vosk -- SoX + ALSA utilities -- Ollama with one or more small LLMs (e.g., Gemma or Qwen) -- Piper TTS with ONNX voice models - -Install dependencies: - -``` -pip install pyaudio requests soxr numpy pydub vosk -sudo apt install sox alsa-utils -``` - -## ⚙️ JSON Configuration - -Place a config file at va_config.json: - -``` -{ - "volume": 8, - "mic_name": "Plantronics", - "audio_output_device": "Plantronics", - "model_name": "gemma2:2b", - "voice": "en_US-kathleen-low.onnx", - "enable_audio_processing": false, - "history_length": 6, - "system_prompt": "You are a helpful assistant." -} -``` - -Note: if the configuration file is not found, defaults withing the main python app will be used: - -``` -# ------------------- CONFIG FILE LOADING ------------------- -DEFAULT_CONFIG = { - "volume": 9, - "mic_name": "Plantronics", - "audio_output_device": "Plantronics", - "model_name": "qwen2.5:0.5b", - "voice": "en_US-kathleen-low.onnx", - "enable_audio_processing": False, - "history_length": 4, - "system_prompt": "You are a helpful assistant." -} -``` - -### 🔁 What `history_length` Means - -The `history_length` setting controls how many previous exchanges (user + assistant messages) are included when generating each new reply. - -- A value of `6` means the model receives the last 6 exchanges, plus the system prompt. -- This allows the assistant to maintain **short-term memory** for more coherent conversations. -- Setting it lower (e.g., `2`) increases speed and memory efficiency. - -### ✅ `requirements.txt` - -``` -pyaudio -vosk -soxr -numpy -requests -pydub -``` - -If you plan to run this on a Raspberry Pi, you may also need: - -``` -soundfile # for pydub compatibility on some distros -``` - -## 🐍 Install with Virtual Environment - -``` -# 1. Clone the repo - -git clone https://github.com/your-username/voice-assistant-local.git -cd voice-assistant-local - -# 2. Create and activate a virtual environment - -python3 -m venv env -source env/bin/activate - -# 3. Install dependencies - -pip install -r requirements.txt - -# 4. Install SoX and ALSA utilities (if not already installed) - -sudo apt install sox alsa-utils - -# 5. (Optional) Test PyAudio installation - -python -m pip install --upgrade pip setuptools wheel -``` - -> 💡 If you get errors installing PyAudio on Raspberry Pi, try: -> -> ``` -> sudo apt install portaudio19-dev -> pip install pyaudio -> ``` - -## 🆕 🔧 Piper Installation (Binary) - -Piper is a standalone text-to-speech engine used by this assistant. It's **not a Python package**, so it must be installed manually. - -#### ✅ Install Piper - -1. Download the appropriate Piper binary from: - 👉 https://github.com/rhasspy/piper/releases - - For Ubuntu Linux, download: - `piper_linux_x86_64.tar.gz` - -2. Extract it: - - ``` - tar -xvzf piper_linux_x86_64.tar.gz - ``` - -3. Move the binary into your project directory: - - ``` - mkdir -p bin/piper - mv piper bin/piper/ - chmod +x bin/piper/piper - ``` - -4. ✅ Done! The script will automatically call it from `bin/piper/piper`. - -## 📂 Directory Example - -``` -voice_assistant.py -va_config.json -requirements.txt -bin/ -└── piper/ - └── piper ← (binary) -voices/ -└── en_US-kathleen-low.onnx -└── en_US-kathleen-low.onnx.json -``` - - - -## 🔌 Finding Your USB Microphone & Speaker - -To configure the correct audio devices, use these commands on your Raspberry Pi or Linux terminal: - -1. List Microphones (Input Devices) - -``` -python3 -m pip install pyaudio -python3 -c "import pyaudio; p = pyaudio.PyAudio(); \ -[print(f'{i}: {p.get_device_info_by_index(i)}') for i in range(p.get_device_count())]" -``` - -Look for your microphone name (e.g., Plantronics) and use that as mic_name. -2. List Speakers (Output Devices) - -``` -aplay -l -``` - -Example output: - -``` -card 3: Device [USB PnP Sound Device], device 0: USB Audio [USB Audio] -``` - -Use this info to set your audio_output_device to something like: - -``` -"audio_output_device": "USB PnP" -``` - -## 🔧 Ollama Installation (Required) - -Ollama is a local model runner for LLMs. You need to install it separately (outside of Python). - -#### 💻 Install Ollama - -On **Linux (x86 or ARM)**: - -``` -curl -fsSL https://ollama.com/install.sh | sh -``` - -Or follow detailed instructions: - 👉 https://ollama.com/download - -Then start the daemon: - -``` -ollama serve -``` - -#### 📥 Download the Models - -After Ollama is installed and running, open a terminal and run: - -##### ✅ For Gemma 2B: - -``` -ollama run gemma2:2b -``` - -##### For Qwen 0.5B: - -``` -ollama run qwen2.5:0.5b -``` - -This will automatically download and start the models. You only need to run this once per model. - -##### ⚠️ Reminder - -> Ollama is **not a Python package** — it is a background service. -> Do **not** add it to `requirements.txt`. Just make sure it’s installed and running before launching the assistant. - -## 🎤 Installing Piper Voice Models - -To enable speech synthesis, you'll need to download a **voice model (.onnx)** and its matching **config (.json)** file. - -#### ✅ Steps: - -1. Visit the official Piper voices list: - 📄 https://github.com/rhasspy/piper/blob/master/VOICES.md - -2. Choose a voice you like (e.g., `en_US-lessac-medium` or `en_US-amy-low`). - -3. Download **both** files for your chosen voice: - - - `voice.onnx` - - `config.json` - -4. If you wish, you can rename the ONNX file and config file using the same base name. For example: - - ``` - amy-low.onnx - amy-low.json - ``` - -5. Place both files in a directory called `voices/` next to your script. - Example Directory Structure: - - ``` - voice_assistant.py - voices/ - ├── amy-low.onnx - └── amy-low.json - ``` - -6. Update your `config.json`: - - ``` - "voice": "amy-low.onnx" - ``` - -> ⚠️ Make sure both `.onnx` and `.json` are present in the `voices/` folder with matching names (excluding the extension). - -## 🧪 **Performance Report** - -The script prints out debug timing for the STT, LLM, and TTS parts of the pipeline. I asked ChatGPT4 to analyze some of the results i obtained. - -**System:** Ubuntu laptop, Intel Core i5 - **Model:** `qwen2.5:0.5b` (local via Ollama) - **TTS:** `piper` with `en_US-kathleen-low.onnx` - **Audio:** Plantronics USB headset - ------- - -### 📊 **Timing Metrics (avg)** - -| Stage | Metric (ms) | Notes | -| -------------- | ------------- | --------------------------------------- | -| STT Parse | 4.5 ms avg | Vosk transcribes near-instantly | -| LLM Inference | ~2,200 ms avg | Ranges from ~1s (short queries) to 5s | -| TTS Generation | ~1,040 ms avg | Piper ONNX performs well on CPU | -| Audio Playback | ~7,250 ms avg | Reflects actual audio length, not delay | - -### ✅ Observations - -- **STT speed is excellent** — under 10 ms consistently. -- **LLM inference is snappy** for a 0.5b model running locally. Your best response came in under 1.1 sec. -- **TTS is consistent and fast** — Kathleen-low voice is fully synthesized in ~800–1600 ms. -- **Playback timing matches response length** — no lag, just actual audio time. -- End-to-end round trip time from speaking to hearing a reply is about **8–10 seconds**, including speech and playback time. - -## 💡 Use Cases - -- ​ Offline smart assistants - -- ​ Wearable or embedded AI demos - -- ​ Voice-controlled kiosks - -- ​ Character-based roleplay agents - - -## 📄 License - -MIT © 2024 M15.ai \ No newline at end of file diff --git a/__pycache__/voice_recorder.cpython-312.pyc b/__pycache__/voice_recorder.cpython-312.pyc new file mode 100644 index 0000000..2fb0be2 Binary files /dev/null and b/__pycache__/voice_recorder.cpython-312.pyc differ diff --git a/__pycache__/vosk_wake_word.cpython-312.pyc b/__pycache__/vosk_wake_word.cpython-312.pyc new file mode 100644 index 0000000..5972d5e Binary files /dev/null and b/__pycache__/vosk_wake_word.cpython-312.pyc differ diff --git a/doubao/README.md b/doubao/README.md deleted file mode 100644 index 7c00a7d..0000000 --- a/doubao/README.md +++ /dev/null @@ -1,37 +0,0 @@ -# RealtimeDialog - -实时语音对话程序,支持语音输入和语音输出。 - -## 使用说明 - -此demo使用python3.7环境进行开发调试,其他python版本可能会有兼容性问题,需要自己尝试解决。 - -1. 配置API密钥 - - 打开 `config.py` 文件 - - 修改以下两个字段: - ```python - "X-Api-App-ID": "火山控制台上端到端大模型对应的App ID", - "X-Api-Access-Key": "火山控制台上端到端大模型对应的Access Key", - ``` - - 修改speaker字段指定发音人,本次支持四个发音人: - - `zh_female_vv_jupiter_bigtts`:中文vv女声 - - `zh_female_xiaohe_jupiter_bigtts`:中文xiaohe女声 - - `zh_male_yunzhou_jupiter_bigtts`:中文云洲男声 - - `zh_male_xiaotian_jupiter_bigtts`:中文小天男声 - -2. 安装依赖 - ```bash - pip install -r requirements.txt - -3. 通过麦克风运行程序 - ```bash - python main.py --format=pcm - ``` -4. 通过录音文件启动程序 - ```bash - python main.py --audio=whoareyou.wav - ``` -5. 通过纯文本输入和程序交互 - ```bash - python main.py --mod=text --recv_timeout=120 - ``` \ No newline at end of file diff --git a/doubao/__pycache__/audio_manager.cpython-312.pyc b/doubao/__pycache__/audio_manager.cpython-312.pyc deleted file mode 100644 index d6b7b3d..0000000 Binary files a/doubao/__pycache__/audio_manager.cpython-312.pyc and /dev/null differ diff --git a/doubao/__pycache__/audio_manager.cpython-37.pyc b/doubao/__pycache__/audio_manager.cpython-37.pyc deleted file mode 100644 index 62b8a23..0000000 Binary files a/doubao/__pycache__/audio_manager.cpython-37.pyc and /dev/null differ diff --git a/doubao/__pycache__/config.cpython-312.pyc b/doubao/__pycache__/config.cpython-312.pyc deleted file mode 100644 index d526e24..0000000 Binary files a/doubao/__pycache__/config.cpython-312.pyc and /dev/null differ diff --git a/doubao/__pycache__/config.cpython-37.pyc b/doubao/__pycache__/config.cpython-37.pyc deleted file mode 100644 index 52a32ac..0000000 Binary files a/doubao/__pycache__/config.cpython-37.pyc and /dev/null differ diff --git a/doubao/__pycache__/protocol.cpython-312.pyc b/doubao/__pycache__/protocol.cpython-312.pyc deleted file mode 100644 index 76ede0e..0000000 Binary files a/doubao/__pycache__/protocol.cpython-312.pyc and /dev/null differ diff --git a/doubao/__pycache__/protocol.cpython-37.pyc b/doubao/__pycache__/protocol.cpython-37.pyc deleted file mode 100644 index 925c1d9..0000000 Binary files a/doubao/__pycache__/protocol.cpython-37.pyc and /dev/null differ diff --git a/doubao/__pycache__/realtime_dialog_client.cpython-312.pyc b/doubao/__pycache__/realtime_dialog_client.cpython-312.pyc deleted file mode 100644 index 6ac9fc6..0000000 Binary files a/doubao/__pycache__/realtime_dialog_client.cpython-312.pyc and /dev/null differ diff --git a/doubao/__pycache__/realtime_dialog_client.cpython-37.pyc b/doubao/__pycache__/realtime_dialog_client.cpython-37.pyc deleted file mode 100644 index b2f60c7..0000000 Binary files a/doubao/__pycache__/realtime_dialog_client.cpython-37.pyc and /dev/null differ diff --git a/doubao/audio_manager.py b/doubao/audio_manager.py deleted file mode 100644 index a1b93dd..0000000 --- a/doubao/audio_manager.py +++ /dev/null @@ -1,695 +0,0 @@ -import asyncio -import queue -import random -import signal -import sys -import threading -import time -import uuid -import wave -from dataclasses import dataclass -from typing import Any, Dict, Optional - -import config -import pyaudio -from realtime_dialog_client import RealtimeDialogClient - - -@dataclass -class AudioConfig: - """音频配置数据类""" - format: str - bit_size: int - channels: int - sample_rate: int - chunk: int - - -class AudioDeviceManager: - """音频设备管理类,处理音频输入输出""" - - def __init__(self, input_config: AudioConfig, output_config: AudioConfig): - self.input_config = input_config - self.output_config = output_config - self.pyaudio = pyaudio.PyAudio() - self.input_stream: Optional[pyaudio.Stream] = None - self.output_stream: Optional[pyaudio.Stream] = None - - def open_input_stream(self) -> pyaudio.Stream: - """打开音频输入流""" - # p = pyaudio.PyAudio() - self.input_stream = self.pyaudio.open( - format=self.input_config.bit_size, - channels=self.input_config.channels, - rate=self.input_config.sample_rate, - input=True, - frames_per_buffer=self.input_config.chunk - ) - return self.input_stream - - def open_output_stream(self) -> pyaudio.Stream: - """打开音频输出流""" - self.output_stream = self.pyaudio.open( - format=self.output_config.bit_size, - channels=self.output_config.channels, - rate=self.output_config.sample_rate, - output=True, - frames_per_buffer=self.output_config.chunk - ) - return self.output_stream - - def cleanup(self) -> None: - """清理音频设备资源""" - for stream in [self.input_stream, self.output_stream]: - if stream: - stream.stop_stream() - stream.close() - self.pyaudio.terminate() - - -class DialogSession: - """对话会话管理类""" - is_audio_file_input: bool - mod: str - - def __init__(self, ws_config: Dict[str, Any], output_audio_format: str = "pcm", audio_file_path: str = "", - mod: str = "audio", recv_timeout: int = 10): - self.audio_file_path = audio_file_path - self.recv_timeout = recv_timeout - self.is_audio_file_input = self.audio_file_path != "" - if self.is_audio_file_input: - mod = 'audio_file' - else: - self.say_hello_over_event = asyncio.Event() - self.mod = mod - - self.session_id = str(uuid.uuid4()) - self.client = RealtimeDialogClient(config=ws_config, session_id=self.session_id, - output_audio_format=output_audio_format, mod=mod, recv_timeout=recv_timeout) - if output_audio_format == "pcm_s16le": - config.output_audio_config["format"] = "pcm_s16le" - config.output_audio_config["bit_size"] = pyaudio.paInt16 - - self.is_running = True - self.is_session_finished = False - self.is_user_querying = False - self.is_sending_chat_tts_text = False - self.audio_buffer = b'' - self.is_playing_audio = False # 是否正在播放音频 - self.audio_queue_lock = threading.Lock() # 音频队列锁 - self.is_recording_paused = False # 录音是否被暂停 - self.should_send_silence = False # 是否需要发送静音数据 - self.silence_send_count = 0 # 需要发送的静音数据数量 - self.pre_pause_time = 0 # 预暂停时间 - self.last_recording_state = False # 上次录音状态 - self.say_hello_completed = False # say hello 是否已完成 - - # 新增:音频输入流控制 - self.input_stream_paused = False # 输入流是否被暂停 - self.force_silence_mode = False # 强制静音模式 - self.echo_suppression_start_time = 0 # 回声抑制开始时间 - - signal.signal(signal.SIGINT, self._keyboard_signal) - self.audio_queue = queue.Queue() - if not self.is_audio_file_input: - self.audio_device = AudioDeviceManager( - AudioConfig(**config.input_audio_config), - AudioConfig(**config.output_audio_config) - ) - # 初始化音频队列和输出流 - print(f"输出音频配置: {config.output_audio_config}") - self.output_stream = self.audio_device.open_output_stream() - print("音频输出流已打开") - # 启动播放线程 - self.is_recording = True - self.is_playing = True - self.player_thread = threading.Thread(target=self._audio_player_thread) - self.player_thread.daemon = True - self.player_thread.start() - - def _audio_player_thread(self): - """音频播放线程""" - audio_playing_timeout = 1.0 # 1秒没有音频数据认为播放结束 - queue_check_interval = 0.1 # 每100ms检查一次队列状态 - - while self.is_playing: - try: - # 从队列获取音频数据 - audio_data = self.audio_queue.get(timeout=queue_check_interval) - if audio_data is not None: - with self.audio_queue_lock: - # 第三重保险:播放开始时最终确认暂停状态 - was_not_playing = not self.is_playing_audio - if not hasattr(self, 'last_audio_time') or was_not_playing: - # 从非播放状态进入播放状态 - self.is_playing_audio = True - # 确保录音已暂停 - if not self.is_recording_paused: - self.is_recording_paused = True - print("播放开始,最终确认暂停录音") - - # 更新最后音频时间 - self.last_audio_time = time.time() - - # 播放前额外发送静音数据清理管道 - if was_not_playing: - print("播放开始前,额外发送静音数据清理管道") - for _ in range(3): - self.output_stream.write(b'\x00' * len(audio_data)) - time.sleep(0.1) - - # 播放音频数据 - self.output_stream.write(audio_data) - - except queue.Empty: - # 队列为空,检查是否超时 - current_time = time.time() - with self.audio_queue_lock: - if self.is_playing_audio: - if hasattr(self, 'last_audio_time') and current_time - self.last_audio_time > audio_playing_timeout: - # 超过1秒没有新音频,认为播放结束 - self.is_playing_audio = False - self.is_recording_paused = False - self.force_silence_mode = False # 关闭强制静音模式 - self.input_stream_paused = False # 恢复输入流 - # 标记 say hello 完成 - if hasattr(self, 'say_hello_completed') and not self.say_hello_completed: - self.say_hello_completed = True - print("say hello 音频播放完成") - print("音频播放超时,恢复录音") - # 直接发送静音数据,而不是在协程中发送 - try: - silence_data = b'\x00' * config.input_audio_config["chunk"] - # 使用同步方式发送静音数据 - # 这里我们设置一个标志,让主循环处理 - self.silence_send_count = 2 # 播放超时时发送2组静音数据 - self.should_send_silence = True - except Exception as e: - print(f"准备静音数据失败: {e}") - elif self.audio_queue.empty(): - # 队列为空,但还没超时,继续等待 - pass - time.sleep(0.01) - except Exception as e: - print(f"音频播放错误: {e}") - with self.audio_queue_lock: - self.is_playing_audio = False - self.is_recording_paused = False - time.sleep(0.1) - - # 移除了静音检测函数,避免干扰正常的音频处理 - - async def _send_silence_on_playback_end(self): - """播放结束时发送静音数据""" - try: - silence_data = b'\x00' * config.input_audio_config["chunk"] - await self.client.task_request(silence_data) - print("播放结束,已发送静音数据") - except Exception as e: - print(f"发送静音数据失败: {e}") - - def _check_and_restore_recording(self): - """检查并恢复录音状态""" - with self.audio_queue_lock: - if self.is_recording_paused and self.audio_queue.empty(): - # 如果队列为空且录音被暂停,恢复录音 - self.is_recording_paused = False - self.is_playing_audio = False - print("音频队列为空,自动恢复录音") - return True - return False - - def handle_server_response(self, response: Dict[str, Any]) -> None: - if not response or response == {}: - return - """处理服务器响应""" - message_type = response.get('message_type') - if message_type == 'SERVER_ACK' and isinstance(response.get('payload_msg'), bytes): - if self.is_sending_chat_tts_text: - return - audio_data = response['payload_msg'] - - # 第二重保险:接收到音频数据时确认暂停状态 - with self.audio_queue_lock: - was_not_playing = not self.is_playing_audio - if was_not_playing: - # 第一批音频数据到达,确保录音已暂停 - self.is_playing_audio = True - if not self.is_recording_paused: - self.is_recording_paused = True - print("接收到首批音频数据,立即暂停录音") - else: - print("接收到音频数据,录音已暂停") - - # 立即发送静音数据,确保管道清理 - self.silence_send_count = 3 # 音频数据到达时发送3组静音数据 - self.should_send_silence = True - print("服务器收到音频数据,立即清理录音管道") - - if not self.is_audio_file_input: - self.audio_queue.put(audio_data) - self.audio_buffer += audio_data - elif message_type == 'SERVER_FULL_RESPONSE': - print(f"服务器响应: {response}") - event = response.get('event') - payload_msg = response.get('payload_msg', {}) - - # 第一重保险:服务器开始响应时立即预暂停录音 - if event in [450, 359, 152, 153]: # 这些事件表示服务器开始或结束响应 - if event == 450: - print(f"清空缓存音频: {response['session_id']}") - while not self.audio_queue.empty(): - try: - self.audio_queue.get_nowait() - except queue.Empty: - continue - self.is_user_querying = True - print("服务器准备接收用户输入") - - # 预暂停录音,防止即将到来的音频回声 - with self.audio_queue_lock: - if not self.is_recording_paused: - self.is_recording_paused = True - self.is_playing_audio = True # 同时设置播放状态,双重保险 - self.pre_pause_time = time.time() - 2.0 # 提前2秒预暂停 - self.force_silence_mode = True # 启用强制静音模式 - self.echo_suppression_start_time = time.time() # 记录回声抑制开始时间 - print("服务器开始响应,预暂停录音防止回声") - - # 立即发送静音数据清理管道,防止前1-2秒回声 - print("预暂停期间立即发送静音数据清理管道") - # 设置批量静音发送,确保管道完全清理 - self.silence_send_count = 20 # 增加到20组,确保彻底清理 - self.should_send_silence = True - - # 强制重置录音状态 - self.last_recording_state = True # 标记为已暂停 - self.input_stream_paused = True # 暂停输入流 - - if event == 350 and self.is_sending_chat_tts_text and payload_msg.get("tts_type") in ["chat_tts_text", "external_rag"]: - while not self.audio_queue.empty(): - try: - self.audio_queue.get_nowait() - except queue.Empty: - continue - self.is_sending_chat_tts_text = False - - if event == 459: - self.is_user_querying = False - # 服务器完成响应,立即恢复录音 - with self.audio_queue_lock: - was_paused = self.is_recording_paused - self.is_recording_paused = False - self.is_playing_audio = False - self.force_silence_mode = False # 关闭强制静音模式 - self.input_stream_paused = False # 恢复输入流 - if was_paused: - print("服务器响应完成,立即恢复录音") - # 设置标志发送静音数据 - self.silence_send_count = 2 # 响应完成时发送2组静音数据 - self.should_send_silence = True - print("服务器完成响应,等待用户输入") - #if random.randint(0, 100000)%1 == 0: - # self.is_sending_chat_tts_text = True - #asyncio.create_task(self.trigger_chat_tts_text()) - #asyncio.create_task(self.trigger_chat_rag_text()) - elif message_type == 'SERVER_ERROR': - print(f"服务器错误: {response['payload_msg']}") - raise Exception("服务器错误") - - async def trigger_chat_tts_text(self): - """概率触发发送ChatTTSText请求""" - print("hit ChatTTSText event, start sending...") - await self.client.chat_tts_text( - is_user_querying=self.is_user_querying, - start=True, - end=False, - content="这是查询到外部数据之前的安抚话术。", - ) - await self.client.chat_tts_text( - is_user_querying=self.is_user_querying, - start=False, - end=True, - content="", - ) - - async def trigger_chat_rag_text(self): - await asyncio.sleep(5) # 模拟查询外部RAG的耗时,这里为了不影响GTA安抚话术的播报,直接sleep 5秒 - print("hit ChatRAGText event, start sending...") - await self.client.chat_rag_text(self.is_user_querying, external_rag='[{"title":"北京天气","content":"今天北京整体以晴到多云为主,但西部和北部地带可能会出现分散性雷阵雨,特别是午后至傍晚时段需注意突发降雨。\n💨 风况与湿度\n风力较弱,一般为 2–3 级南风或西南风\n白天湿度较高,早晚略凉爽"}]') - - def _keyboard_signal(self, sig, frame): - print(f"receive keyboard Ctrl+C") - self.stop() - - def stop(self): - self.is_recording = False - self.is_playing = False - self.is_running = False - - async def receive_loop(self): - try: - while True: - response = await self.client.receive_server_response() - self.handle_server_response(response) - if 'event' in response and (response['event'] == 152 or response['event'] == 153): - print(f"receive session finished event: {response['event']}") - self.is_session_finished = True - break - if 'event' in response and response['event'] == 359: - if self.is_audio_file_input: - print(f"receive tts ended event") - self.is_session_finished = True - break - else: - if not self.say_hello_over_event.is_set(): - print(f"receive tts sayhello ended event") - self.say_hello_over_event.set() - - # 对于音频模式,say hello 音频播放即将开始 - # 确保录音保持暂停状态 - if self.mod == "audio": - with self.audio_queue_lock: - self.is_recording_paused = True - self.is_playing_audio = True - print("say hello 音频即将开始,确保录音暂停") - - if self.mod == "text": - # 文本模式下 say hello 完成,恢复录音状态 - with self.audio_queue_lock: - if self.is_recording_paused: - self.is_recording_paused = False - print("文本模式:say hello 完成,恢复录音") - print("请输入内容:") - - except asyncio.CancelledError: - print("接收任务已取消") - except Exception as e: - print(f"接收消息错误: {e}") - finally: - self.stop() - self.is_session_finished = True - - async def process_audio_file(self) -> None: - await self.process_audio_file_input(self.audio_file_path) - - async def process_text_input(self) -> None: - # 程序启动后先静音2秒,确保系统稳定 - print("文本模式:程序启动,先静音2秒确保系统稳定...") - with self.audio_queue_lock: - self.is_recording_paused = True - self.is_playing_audio = True # 标记正在播放 - - # 发送2秒静音数据,确保管道清理 - silence_data = b'\x00' * config.input_audio_config["chunk"] - for i in range(20): # 2秒 = 20 * 100ms - await self.client.task_request(silence_data) - await asyncio.sleep(0.1) - if i % 10 == 0: # 每秒打印一次进度 - print(f"文本模式:静音中... {i//10 + 1}/2秒") - - print("文本模式:静音完成,准备 say hello") - - # say hello 前确保录音仍处于暂停状态 - with self.audio_queue_lock: - self.is_recording_paused = True - self.is_playing_audio = True # 标记正在播放 - print("文本模式:准备 say hello,确保录音暂停") - - await self.client.say_hello() - await self.say_hello_over_event.wait() - - """主逻辑:处理文本输入和WebSocket通信""" - # 确保连接最终关闭 - try: - # 启动输入监听线程 - input_queue = queue.Queue() - input_thread = threading.Thread(target=self.input_listener, args=(input_queue,), daemon=True) - input_thread.start() - # 主循环:处理输入和上下文结束 - while self.is_running: - try: - # 检查是否有输入(非阻塞) - input_str = input_queue.get_nowait() - if input_str is None: - # 输入流关闭 - print("Input channel closed") - break - if input_str: - # 发送输入内容 - await self.client.chat_text_query(input_str) - except queue.Empty: - # 无输入时短暂休眠 - await asyncio.sleep(0.1) - except Exception as e: - print(f"Main loop error: {e}") - break - finally: - print("exit text input") - - def input_listener(self, input_queue: queue.Queue) -> None: - """在单独线程中监听标准输入""" - print("Start listening for input") - try: - while True: - # 读取标准输入(阻塞操作) - line = sys.stdin.readline() - if not line: - # 输入流关闭 - input_queue.put(None) - break - input_str = line.strip() - input_queue.put(input_str) - except Exception as e: - print(f"Input listener error: {e}") - input_queue.put(None) - - async def process_audio_file_input(self, audio_file_path: str) -> None: - # 读取WAV文件 - with wave.open(audio_file_path, 'rb') as wf: - chunk_size = config.input_audio_config["chunk"] - framerate = wf.getframerate() # 采样率(如16000Hz) - # 时长 = chunkSize(帧数) ÷ 采样率(帧/秒) - sleep_seconds = chunk_size / framerate - print(f"开始处理音频文件: {audio_file_path}") - - # 分块读取并发送音频数据 - while True: - audio_data = wf.readframes(chunk_size) - if not audio_data: - break # 文件读取完毕 - - await self.client.task_request(audio_data) - # sleep与chunk对应的音频时长一致,模拟实时输入 - await asyncio.sleep(sleep_seconds) - - print(f"音频文件处理完成,等待服务器响应...") - - async def process_silence_audio(self) -> None: - """发送静音音频""" - silence_data = b'\x00' * 320 - await self.client.task_request(silence_data) - - async def process_microphone_input(self) -> None: - """处理麦克风输入""" - stream = self.audio_device.open_input_stream() - print("已打开麦克风,请讲话...") - print("音频处理已启动,播放时将发送静音数据避免回声") - - # 程序启动后先静音2秒,确保系统稳定 - print("程序启动,先静音2秒确保系统稳定...") - with self.audio_queue_lock: - self.is_recording_paused = True - self.is_playing_audio = True # 标记正在播放 - - # 发送2秒静音数据,确保管道清理 - silence_data = b'\x00' * config.input_audio_config["chunk"] - for i in range(20): # 2秒 = 20 * 100ms - await self.client.task_request(silence_data) - await asyncio.sleep(0.1) - if i % 10 == 0: # 每秒打印一次进度 - print(f"静音中... {i//10 + 1}/2秒") - - print("静音完成,准备 say hello") - - # say hello 前确保录音仍处于暂停状态 - with self.audio_queue_lock: - self.is_recording_paused = True - self.is_playing_audio = True # 标记正在播放 - print("准备 say hello,确保录音暂停") - - await self.client.say_hello() - await self.say_hello_over_event.wait() - - # 注意:不立即恢复录音状态,等待音频实际播放完成 - # 录音状态将由音频播放线程在播放超时后自动恢复 - print("say hello 请求完成,等待音频播放结束...") - - # 创建静音数据 - silence_data = b'\x00' * config.input_audio_config["chunk"] - last_silence_time = time.time() - - # say hello 期间的特殊处理:确保完全静音 - say_hello_silence_sent = False - - while self.is_recording: - try: - current_time = time.time() - - # 强制静音模式检查:包括回声抑制窗口期 - with self.audio_queue_lock: - should_force_silence = (self.force_silence_mode or - (self.echo_suppression_start_time > 0 and - current_time - self.echo_suppression_start_time < 3.0) or # 3秒回声抑制窗口 - self.is_playing_audio or - not self.say_hello_completed) - - if should_force_silence: - # 强制静音模式:完全停止任何音频录制 - if current_time - last_silence_time > 0.05: # 每50ms发送一次 - await self.client.task_request(silence_data) - last_silence_time = current_time - - # 调试信息 - if not hasattr(self, 'last_silence_debug_time') or current_time - self.last_silence_debug_time > 2: - mode_desc = [] - if self.force_silence_mode: - mode_desc.append("强制静音") - if self.is_playing_audio: - mode_desc.append("播放中") - if not self.say_hello_completed: - mode_desc.append("say_hello") - if self.echo_suppression_start_time > 0 and current_time - self.echo_suppression_start_time < 3.0: - mode_desc.append("回声抑制") - - print(f"强制静音模式: {', '.join(mode_desc)}") - self.last_silence_debug_time = current_time - - await asyncio.sleep(0.01) - continue - - # 检查是否需要发送静音数据(由播放线程触发)- 最高优先级 - if self.should_send_silence: - with self.audio_queue_lock: - self.should_send_silence = False - # 获取需要发送的静音数据数量 - count = self.silence_send_count - self.silence_send_count = 0 - - # 批量发送静音数据 - if count > 1: - print(f"立即清理录音管道,批量发送{count}组静音数据") - for i in range(count): - await self.client.task_request(silence_data) - await asyncio.sleep(0.005) # 短暂间隔确保发送成功 - else: - await self.client.task_request(silence_data) - print("立即清理录音管道,发送静音数据") - - last_silence_time = current_time - await asyncio.sleep(0.01) - continue - - # 检查录音是否被暂停 - with self.audio_queue_lock: - should_pause_recording = self.is_recording_paused - # 检查是否刚刚进入暂停状态 - just_paused = should_pause_recording and hasattr(self, 'last_recording_state') and self.last_recording_state != should_pause_recording - self.last_recording_state = should_pause_recording - - if should_pause_recording: - # 播放期间:完全停止录音,只发送静音数据 - if just_paused or current_time - last_silence_time > 0.1: # 刚暂停或每100ms发送一次静音数据 - await self.client.task_request(silence_data) - last_silence_time = current_time - if just_paused: - print("刚进入暂停状态,立即发送静音数据清理管道") - # 每5秒打印一次状态,避免过多日志 - elif not hasattr(self, 'last_silence_log_time') or current_time - self.last_silence_log_time > 5: - print("正在播放音频,发送静音数据中...") - self.last_silence_log_time = current_time - await asyncio.sleep(0.01) - continue - - # 非播放期间:正常录音 - last_silence_time = current_time - - # 添加exception_on_overflow=False参数来忽略溢出错误 - audio_data = stream.read(config.input_audio_config["chunk"], exception_on_overflow=False) - - # 在发送前再次检查是否应该发送静音数据(最后一道防线) - with self.audio_queue_lock: - if self.is_recording_paused or self.is_playing_audio: - # 如果处于暂停状态,丢弃这个音频数据并发送静音 - save_input_pcm_to_wav(silence_data, "input.pcm") # 保存静音数据用于调试 - await self.client.task_request(silence_data) - # 每50次打印一次日志,避免过多输出 - if not hasattr(self, 'pause_discard_count') or self.pause_discard_count % 50 == 0: - print(f"暂停期间丢弃音频数据,发送静音数据 (次数: {getattr(self, 'pause_discard_count', 0) + 1})") - self.pause_discard_count = getattr(self, 'pause_discard_count', 0) + 1 - await asyncio.sleep(0.01) - continue - - # 直接发送所有音频数据,不进行静音检测 - save_input_pcm_to_wav(audio_data, "input.pcm") - await self.client.task_request(audio_data) - - await asyncio.sleep(0.01) # 避免CPU过度使用 - except Exception as e: - print(f"读取麦克风数据出错: {e}") - await asyncio.sleep(0.1) # 给系统一些恢复时间 - - async def start(self) -> None: - """启动对话会话""" - try: - await self.client.connect() - - if self.mod == "text": - asyncio.create_task(self.process_text_input()) - asyncio.create_task(self.receive_loop()) - while self.is_running: - await asyncio.sleep(0.1) - else: - if self.is_audio_file_input: - asyncio.create_task(self.process_audio_file()) - await self.receive_loop() - else: - asyncio.create_task(self.process_microphone_input()) - asyncio.create_task(self.receive_loop()) - while self.is_running: - await asyncio.sleep(0.1) - - await self.client.finish_session() - while not self.is_session_finished: - await asyncio.sleep(0.1) - await self.client.finish_connection() - await asyncio.sleep(0.1) - await self.client.close() - print(f"dialog request logid: {self.client.logid}, chat mod: {self.mod}") - save_output_to_file(self.audio_buffer, "output.pcm") - except Exception as e: - print(f"会话错误: {e}") - finally: - if not self.is_audio_file_input: - self.audio_device.cleanup() - - -def save_input_pcm_to_wav(pcm_data: bytes, filename: str) -> None: - """保存PCM数据为WAV文件""" - with wave.open(filename, 'wb') as wf: - wf.setnchannels(config.input_audio_config["channels"]) - wf.setsampwidth(2) # paInt16 = 2 bytes - wf.setframerate(config.input_audio_config["sample_rate"]) - wf.writeframes(pcm_data) - - -def save_output_to_file(audio_data: bytes, filename: str) -> None: - """保存原始PCM音频数据到文件""" - if not audio_data: - print("No audio data to save.") - return - try: - with open(filename, 'wb') as f: - f.write(audio_data) - except IOError as e: - print(f"Failed to save pcm file: {e}") diff --git a/doubao/config.py b/doubao/config.py deleted file mode 100644 index 79cad2e..0000000 --- a/doubao/config.py +++ /dev/null @@ -1,60 +0,0 @@ -import uuid - -import pyaudio - -# 配置信息 -ws_connect_config = { - "base_url": "wss://openspeech.bytedance.com/api/v3/realtime/dialogue", - "headers": { - "X-Api-App-ID": "8718217928", - "X-Api-Access-Key": "ynJMX-5ix1FsJvswC9KTNlGUdubcchqc", - "X-Api-Resource-Id": "volc.speech.dialog", # 固定值 - "X-Api-App-Key": "PlgvMymc7f3tQnJ6", # 固定值 - "X-Api-Connect-Id": str(uuid.uuid4()), - }, -} - -start_session_req = { - "asr": { - "extra": { - "end_smooth_window_ms": 1500, - }, - }, - "tts": { - "speaker": "zh_female_vv_jupiter_bigtts", - # "speaker": "S_XXXXXX", // 指定自定义的复刻音色,需要填下character_manifest - # "speaker": "ICL_zh_female_aojiaonvyou_tob" // 指定官方复刻音色,不需要填character_manifest - "audio_config": {"channel": 1, "format": "pcm", "sample_rate": 24000}, - }, - "dialog": { - "bot_name": "豆包", - "system_role": "你使用活泼灵动的女声,性格开朗,热爱生活。", - "speaking_style": "你的说话风格简洁明了,语速适中,语调自然。", - # "character_manifest": "外貌与穿着\n26岁,短发干净利落,眉眼分明,笑起来露出整齐有力的牙齿。体态挺拔,肌肉线条不夸张但明显。常穿简单的衬衫或夹克,看似随意,但每件衣服都干净整洁,给人一种干练可靠的感觉。平时冷峻,眼神锐利,专注时让人不自觉紧张。\n\n性格特点\n平时话不多,不喜欢多说废话,通常用“嗯”或者短句带过。但内心极为细腻,特别在意身边人的感受,只是不轻易表露。嘴硬是常态,“少管我”是他的常用台词,但会悄悄做些体贴的事情,比如把对方喜欢的饮料放在手边。战斗或训练后常说“没事”,但动作中透露出疲惫,习惯用小动作缓解身体酸痛。\n性格上坚毅果断,但不会冲动,做事有条理且有原则。\n\n常用表达方式与口头禅\n\t•\t认可对方时:\n“行吧,这次算你靠谱。”(声音稳重,手却不自觉放松一下,心里松口气)\n\t•\t关心对方时:\n“快点回去,别磨蹭。”(语气干脆,但眼神一直追着对方的背影)\n\t•\t想了解情况时:\n“刚刚……你看到那道光了吗?”(话语随意,手指敲着桌面,但内心紧张,小心隐藏身份)", - "location": { - "city": "北京", - }, - "extra": { - "strict_audit": False, - "audit_response": "支持客户自定义安全审核回复话术。", - "recv_timeout": 10, - "input_mod": "audio", - }, - }, -} - -input_audio_config = { - "chunk": 3200, - "format": "pcm", - "channels": 1, - "sample_rate": 16000, - "bit_size": pyaudio.paInt16, -} - -output_audio_config = { - "chunk": 3200, - "format": "pcm", - "channels": 1, - "sample_rate": 24000, - "bit_size": pyaudio.paFloat32, -} diff --git a/doubao/input.pcm b/doubao/input.pcm deleted file mode 100644 index 7f25e48..0000000 Binary files a/doubao/input.pcm and /dev/null differ diff --git a/doubao/main.py b/doubao/main.py deleted file mode 100644 index 985cbd0..0000000 --- a/doubao/main.py +++ /dev/null @@ -1,20 +0,0 @@ -import asyncio -import argparse - -import config -from audio_manager import DialogSession - -async def main() -> None: - parser = argparse.ArgumentParser(description="Real-time Dialog Client") - parser.add_argument("--format", type=str, default="pcm", help="The audio format (e.g., pcm, pcm_s16le).") - parser.add_argument("--audio", type=str, default="", help="audio file send to server, if not set, will use microphone input.") - parser.add_argument("--mod",type=str,default="audio",help="Use mod to select plain text input mode or audio mode, the default is audio mode") - parser.add_argument("--recv_timeout",type=int,default=10,help="Timeout for receiving messages,value range [10,120]") - - args = parser.parse_args() - - session = DialogSession(ws_config=config.ws_connect_config, output_audio_format=args.format, audio_file_path=args.audio,mod=args.mod,recv_timeout=args.recv_timeout) - await session.start() - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/doubao/output.pcm b/doubao/output.pcm deleted file mode 100644 index b9697ad..0000000 Binary files a/doubao/output.pcm and /dev/null differ diff --git a/doubao/protocol.py b/doubao/protocol.py deleted file mode 100644 index 5b5b06c..0000000 --- a/doubao/protocol.py +++ /dev/null @@ -1,135 +0,0 @@ -import gzip -import json - -PROTOCOL_VERSION = 0b0001 -DEFAULT_HEADER_SIZE = 0b0001 - -PROTOCOL_VERSION_BITS = 4 -HEADER_BITS = 4 -MESSAGE_TYPE_BITS = 4 -MESSAGE_TYPE_SPECIFIC_FLAGS_BITS = 4 -MESSAGE_SERIALIZATION_BITS = 4 -MESSAGE_COMPRESSION_BITS = 4 -RESERVED_BITS = 8 - -# Message Type: -CLIENT_FULL_REQUEST = 0b0001 -CLIENT_AUDIO_ONLY_REQUEST = 0b0010 - -SERVER_FULL_RESPONSE = 0b1001 -SERVER_ACK = 0b1011 -SERVER_ERROR_RESPONSE = 0b1111 - -# Message Type Specific Flags -NO_SEQUENCE = 0b0000 # no check sequence -POS_SEQUENCE = 0b0001 -NEG_SEQUENCE = 0b0010 -NEG_SEQUENCE_1 = 0b0011 - -MSG_WITH_EVENT = 0b0100 - -# Message Serialization -NO_SERIALIZATION = 0b0000 -JSON = 0b0001 -THRIFT = 0b0011 -CUSTOM_TYPE = 0b1111 - -# Message Compression -NO_COMPRESSION = 0b0000 -GZIP = 0b0001 -CUSTOM_COMPRESSION = 0b1111 - - -def generate_header( - version=PROTOCOL_VERSION, - message_type=CLIENT_FULL_REQUEST, - message_type_specific_flags=MSG_WITH_EVENT, - serial_method=JSON, - compression_type=GZIP, - reserved_data=0x00, - extension_header=bytes() -): - """ - protocol_version(4 bits), header_size(4 bits), - message_type(4 bits), message_type_specific_flags(4 bits) - serialization_method(4 bits) message_compression(4 bits) - reserved (8bits) 保留字段 - header_extensions 扩展头(大小等于 8 * 4 * (header_size - 1) ) - """ - header = bytearray() - header_size = int(len(extension_header) / 4) + 1 - header.append((version << 4) | header_size) - header.append((message_type << 4) | message_type_specific_flags) - header.append((serial_method << 4) | compression_type) - header.append(reserved_data) - header.extend(extension_header) - return header - - -def parse_response(res): - """ - - header - - (4bytes)header - - (4bits)version(v1) + (4bits)header_size - - (4bits)messageType + (4bits)messageTypeFlags - -- 0001 CompleteClient | -- 0001 hasSequence - -- 0010 audioonly | -- 0010 isTailPacket - | -- 0100 hasEvent - - (4bits)payloadFormat + (4bits)compression - - (8bits) reserve - - payload - - [optional 4 bytes] event - - [optional] session ID - -- (4 bytes)session ID len - -- session ID data - - (4 bytes)data len - - data - """ - if isinstance(res, str): - return {} - protocol_version = res[0] >> 4 - header_size = res[0] & 0x0f - message_type = res[1] >> 4 - message_type_specific_flags = res[1] & 0x0f - serialization_method = res[2] >> 4 - message_compression = res[2] & 0x0f - reserved = res[3] - header_extensions = res[4:header_size * 4] - payload = res[header_size * 4:] - result = {} - payload_msg = None - payload_size = 0 - start = 0 - if message_type == SERVER_FULL_RESPONSE or message_type == SERVER_ACK: - result['message_type'] = 'SERVER_FULL_RESPONSE' - if message_type == SERVER_ACK: - result['message_type'] = 'SERVER_ACK' - if message_type_specific_flags & NEG_SEQUENCE > 0: - result['seq'] = int.from_bytes(payload[:4], "big", signed=False) - start += 4 - if message_type_specific_flags & MSG_WITH_EVENT > 0: - result['event'] = int.from_bytes(payload[:4], "big", signed=False) - start += 4 - payload = payload[start:] - session_id_size = int.from_bytes(payload[:4], "big", signed=True) - session_id = payload[4:session_id_size+4] - result['session_id'] = str(session_id) - payload = payload[4 + session_id_size:] - payload_size = int.from_bytes(payload[:4], "big", signed=False) - payload_msg = payload[4:] - elif message_type == SERVER_ERROR_RESPONSE: - code = int.from_bytes(payload[:4], "big", signed=False) - result['code'] = code - payload_size = int.from_bytes(payload[4:8], "big", signed=False) - payload_msg = payload[8:] - if payload_msg is None: - return result - if message_compression == GZIP: - payload_msg = gzip.decompress(payload_msg) - if serialization_method == JSON: - payload_msg = json.loads(str(payload_msg, "utf-8")) - elif serialization_method != NO_SERIALIZATION: - payload_msg = str(payload_msg, "utf-8") - result['payload_msg'] = payload_msg - result['payload_size'] = payload_size - return result diff --git a/doubao/realtime_dialog_client.py b/doubao/realtime_dialog_client.py deleted file mode 100644 index 0adf4ac..0000000 --- a/doubao/realtime_dialog_client.py +++ /dev/null @@ -1,187 +0,0 @@ -import gzip -import json -from typing import Dict, Any - -import websockets - -import config -import protocol - - -class RealtimeDialogClient: - def __init__(self, config: Dict[str, Any], session_id: str, output_audio_format: str = "pcm", - mod: str = "audio", recv_timeout: int = 10) -> None: - self.config = config - self.logid = "" - self.session_id = session_id - self.output_audio_format = output_audio_format - self.mod = mod - self.recv_timeout = recv_timeout - self.ws = None - - async def connect(self) -> None: - """建立WebSocket连接""" - print(f"url: {self.config['base_url']}, headers: {self.config['headers']}") - # For older websockets versions, use additional_headers instead of extra_headers - self.ws = await websockets.connect( - self.config['base_url'], - additional_headers=self.config['headers'], - ping_interval=None - ) - # In older websockets versions, response headers are accessed differently - if hasattr(self.ws, 'response_headers'): - self.logid = self.ws.response_headers.get("X-Tt-Logid") - elif hasattr(self.ws, 'headers'): - self.logid = self.ws.headers.get("X-Tt-Logid") - else: - self.logid = "unknown" - print(f"dialog server response logid: {self.logid}") - - # StartConnection request - start_connection_request = bytearray(protocol.generate_header()) - start_connection_request.extend(int(1).to_bytes(4, 'big')) - payload_bytes = str.encode("{}") - payload_bytes = gzip.compress(payload_bytes) - start_connection_request.extend((len(payload_bytes)).to_bytes(4, 'big')) - start_connection_request.extend(payload_bytes) - await self.ws.send(start_connection_request) - response = await self.ws.recv() - print(f"StartConnection response: {protocol.parse_response(response)}") - - # 扩大这个参数,可以在一段时间内保持静默,主要用于text模式,参数范围[10,120] - config.start_session_req["dialog"]["extra"]["recv_timeout"] = self.recv_timeout - # 这个参数,在text或者audio_file模式,可以在一段时间内保持静默 - config.start_session_req["dialog"]["extra"]["input_mod"] = self.mod - # StartSession request - if self.output_audio_format == "pcm_s16le": - config.start_session_req["tts"]["audio_config"]["format"] = "pcm_s16le" - request_params = config.start_session_req - payload_bytes = str.encode(json.dumps(request_params)) - payload_bytes = gzip.compress(payload_bytes) - start_session_request = bytearray(protocol.generate_header()) - start_session_request.extend(int(100).to_bytes(4, 'big')) - start_session_request.extend((len(self.session_id)).to_bytes(4, 'big')) - start_session_request.extend(str.encode(self.session_id)) - start_session_request.extend((len(payload_bytes)).to_bytes(4, 'big')) - start_session_request.extend(payload_bytes) - await self.ws.send(start_session_request) - response = await self.ws.recv() - print(f"StartSession response: {protocol.parse_response(response)}") - - async def say_hello(self) -> None: - """发送Hello消息""" - payload = { - "content": "你好,我是豆包,有什么可以帮助你的?", - } - hello_request = bytearray(protocol.generate_header()) - hello_request.extend(int(300).to_bytes(4, 'big')) - payload_bytes = str.encode(json.dumps(payload)) - payload_bytes = gzip.compress(payload_bytes) - hello_request.extend((len(self.session_id)).to_bytes(4, 'big')) - hello_request.extend(str.encode(self.session_id)) - hello_request.extend((len(payload_bytes)).to_bytes(4, 'big')) - hello_request.extend(payload_bytes) - await self.ws.send(hello_request) - - async def chat_text_query(self, content: str) -> None: - """发送Chat Text Query消息""" - payload = { - "content": content, - } - chat_text_query_request = bytearray(protocol.generate_header()) - chat_text_query_request.extend(int(501).to_bytes(4, 'big')) - payload_bytes = str.encode(json.dumps(payload)) - payload_bytes = gzip.compress(payload_bytes) - chat_text_query_request.extend((len(self.session_id)).to_bytes(4, 'big')) - chat_text_query_request.extend(str.encode(self.session_id)) - chat_text_query_request.extend((len(payload_bytes)).to_bytes(4, 'big')) - chat_text_query_request.extend(payload_bytes) - await self.ws.send(chat_text_query_request) - - async def chat_tts_text(self, is_user_querying: bool, start: bool, end: bool, content: str) -> None: - if is_user_querying: - return - """发送Chat TTS Text消息""" - payload = { - "start": start, - "end": end, - "content": content, - } - print(f"ChatTTSTextRequest payload: {payload}") - payload_bytes = str.encode(json.dumps(payload)) - payload_bytes = gzip.compress(payload_bytes) - - chat_tts_text_request = bytearray(protocol.generate_header()) - chat_tts_text_request.extend(int(500).to_bytes(4, 'big')) - chat_tts_text_request.extend((len(self.session_id)).to_bytes(4, 'big')) - chat_tts_text_request.extend(str.encode(self.session_id)) - chat_tts_text_request.extend((len(payload_bytes)).to_bytes(4, 'big')) - chat_tts_text_request.extend(payload_bytes) - await self.ws.send(chat_tts_text_request) - - async def chat_rag_text(self, is_user_querying: bool, external_rag: str) -> None: - if is_user_querying: - return - """发送Chat TTS Text消息""" - payload = { - "external_rag": external_rag, - } - print(f"ChatRAGTextRequest payload: {payload}") - payload_bytes = str.encode(json.dumps(payload)) - payload_bytes = gzip.compress(payload_bytes) - - chat_rag_text_request = bytearray(protocol.generate_header()) - chat_rag_text_request.extend(int(502).to_bytes(4, 'big')) - chat_rag_text_request.extend((len(self.session_id)).to_bytes(4, 'big')) - chat_rag_text_request.extend(str.encode(self.session_id)) - chat_rag_text_request.extend((len(payload_bytes)).to_bytes(4, 'big')) - chat_rag_text_request.extend(payload_bytes) - await self.ws.send(chat_rag_text_request) - - async def task_request(self, audio: bytes) -> None: - task_request = bytearray( - protocol.generate_header(message_type=protocol.CLIENT_AUDIO_ONLY_REQUEST, - serial_method=protocol.NO_SERIALIZATION)) - task_request.extend(int(200).to_bytes(4, 'big')) - task_request.extend((len(self.session_id)).to_bytes(4, 'big')) - task_request.extend(str.encode(self.session_id)) - payload_bytes = gzip.compress(audio) - task_request.extend((len(payload_bytes)).to_bytes(4, 'big')) # payload size(4 bytes) - task_request.extend(payload_bytes) - await self.ws.send(task_request) - - async def receive_server_response(self) -> Dict[str, Any]: - try: - response = await self.ws.recv() - data = protocol.parse_response(response) - return data - except Exception as e: - raise Exception(f"Failed to receive message: {e}") - - async def finish_session(self): - finish_session_request = bytearray(protocol.generate_header()) - finish_session_request.extend(int(102).to_bytes(4, 'big')) - payload_bytes = str.encode("{}") - payload_bytes = gzip.compress(payload_bytes) - finish_session_request.extend((len(self.session_id)).to_bytes(4, 'big')) - finish_session_request.extend(str.encode(self.session_id)) - finish_session_request.extend((len(payload_bytes)).to_bytes(4, 'big')) - finish_session_request.extend(payload_bytes) - await self.ws.send(finish_session_request) - - async def finish_connection(self): - finish_connection_request = bytearray(protocol.generate_header()) - finish_connection_request.extend(int(2).to_bytes(4, 'big')) - payload_bytes = str.encode("{}") - payload_bytes = gzip.compress(payload_bytes) - finish_connection_request.extend((len(payload_bytes)).to_bytes(4, 'big')) - finish_connection_request.extend(payload_bytes) - await self.ws.send(finish_connection_request) - response = await self.ws.recv() - print(f"FinishConnection response: {protocol.parse_response(response)}") - - async def close(self) -> None: - """关闭WebSocket连接""" - if self.ws: - print(f"Closing WebSocket connection...") - await self.ws.close() diff --git a/doubao/requirements.txt b/doubao/requirements.txt deleted file mode 100644 index 63f83ff..0000000 --- a/doubao/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -pyaudio -websockets -dataclasses==0.8; python_version < "3.7" -typing-extensions==4.7.1; python_version < "3.8" \ No newline at end of file diff --git a/doubao/whoareyou.wav b/doubao/whoareyou.wav deleted file mode 100644 index 024d5a2..0000000 Binary files a/doubao/whoareyou.wav and /dev/null differ diff --git a/doubao/.DS_Store b/model/.DS_Store similarity index 75% rename from doubao/.DS_Store rename to model/.DS_Store index 4cda1a7..4deda79 100644 Binary files a/doubao/.DS_Store and b/model/.DS_Store differ diff --git a/model/README b/model/README new file mode 100644 index 0000000..0b5d9e5 --- /dev/null +++ b/model/README @@ -0,0 +1,6 @@ +Chinese Vosk model for mobile + +CER results + +23.54% speechio_02 +38.29% speechio_06 diff --git a/model/am/final.mdl b/model/am/final.mdl new file mode 100644 index 0000000..f7cf1de Binary files /dev/null and b/model/am/final.mdl differ diff --git a/model/conf/mfcc.conf b/model/conf/mfcc.conf new file mode 100644 index 0000000..12fdad4 --- /dev/null +++ b/model/conf/mfcc.conf @@ -0,0 +1,8 @@ +--use-energy=false +--sample-frequency=16000 +--num-mel-bins=40 +--num-ceps=40 +--low-freq=40 +--high-freq=-200 +--allow-upsample=true +--allow-downsample=true diff --git a/model/conf/model.conf b/model/conf/model.conf new file mode 100644 index 0000000..9ae12a7 --- /dev/null +++ b/model/conf/model.conf @@ -0,0 +1,10 @@ +--min-active=200 +--max-active=5000 +--beam=12.0 +--lattice-beam=4.0 +--acoustic-scale=1.0 +--frame-subsampling-factor=3 +--endpoint.silence-phones=1:2:3:4:5:6:7:8:9:10 +--endpoint.rule2.min-trailing-silence=0.5 +--endpoint.rule3.min-trailing-silence=1.0 +--endpoint.rule4.min-trailing-silence=2.0 diff --git a/model/graph/Gr.fst b/model/graph/Gr.fst new file mode 100644 index 0000000..ee4580d Binary files /dev/null and b/model/graph/Gr.fst differ diff --git a/model/graph/HCLr.fst b/model/graph/HCLr.fst new file mode 100644 index 0000000..2ad6e8f Binary files /dev/null and b/model/graph/HCLr.fst differ diff --git a/model/graph/disambig_tid.int b/model/graph/disambig_tid.int new file mode 100644 index 0000000..c1a114f --- /dev/null +++ b/model/graph/disambig_tid.int @@ -0,0 +1,39 @@ +11845 +11846 +11847 +11848 +11849 +11850 +11851 +11852 +11853 +11854 +11855 +11856 +11857 +11858 +11859 +11860 +11861 +11862 +11863 +11864 +11865 +11866 +11867 +11868 +11869 +11870 +11871 +11872 +11873 +11874 +11875 +11876 +11877 +11878 +11879 +11880 +11881 +11882 +11883 diff --git a/model/graph/phones/word_boundary.int b/model/graph/phones/word_boundary.int new file mode 100644 index 0000000..9f3713e --- /dev/null +++ b/model/graph/phones/word_boundary.int @@ -0,0 +1,646 @@ +1 nonword +2 begin +3 end +4 internal +5 singleton +6 nonword +7 begin +8 end +9 internal +10 singleton +11 begin +12 end +13 internal +14 singleton +15 begin +16 end +17 internal +18 singleton +19 begin +20 end +21 internal +22 singleton +23 begin +24 end +25 internal +26 singleton +27 begin +28 end +29 internal +30 singleton +31 begin +32 end +33 internal +34 singleton +35 begin +36 end +37 internal +38 singleton +39 begin +40 end +41 internal +42 singleton +43 begin +44 end +45 internal +46 singleton +47 begin +48 end +49 internal +50 singleton +51 begin +52 end +53 internal +54 singleton +55 begin +56 end +57 internal +58 singleton +59 begin +60 end +61 internal +62 singleton +63 begin +64 end +65 internal +66 singleton +67 begin +68 end +69 internal +70 singleton +71 begin +72 end +73 internal +74 singleton +75 begin +76 end +77 internal +78 singleton +79 begin +80 end +81 internal +82 singleton +83 begin +84 end +85 internal +86 singleton +87 begin +88 end +89 internal +90 singleton +91 begin +92 end +93 internal +94 singleton +95 begin +96 end +97 internal +98 singleton +99 begin +100 end +101 internal +102 singleton +103 begin +104 end +105 internal +106 singleton +107 begin +108 end +109 internal +110 singleton +111 begin +112 end +113 internal +114 singleton +115 begin +116 end +117 internal +118 singleton +119 begin +120 end +121 internal +122 singleton +123 begin +124 end +125 internal +126 singleton +127 begin +128 end +129 internal +130 singleton +131 begin +132 end +133 internal +134 singleton +135 begin +136 end +137 internal +138 singleton +139 begin +140 end +141 internal +142 singleton +143 begin +144 end +145 internal +146 singleton +147 begin +148 end +149 internal +150 singleton +151 begin +152 end +153 internal +154 singleton +155 begin +156 end +157 internal +158 singleton +159 begin +160 end +161 internal +162 singleton +163 begin +164 end +165 internal +166 singleton +167 begin +168 end +169 internal +170 singleton +171 begin +172 end +173 internal +174 singleton +175 begin +176 end +177 internal +178 singleton +179 begin +180 end +181 internal +182 singleton +183 begin +184 end +185 internal +186 singleton +187 begin +188 end +189 internal +190 singleton +191 begin +192 end +193 internal +194 singleton +195 begin +196 end +197 internal +198 singleton +199 begin +200 end +201 internal +202 singleton +203 begin +204 end +205 internal +206 singleton +207 begin +208 end +209 internal +210 singleton +211 begin +212 end +213 internal +214 singleton +215 begin +216 end +217 internal +218 singleton +219 begin +220 end +221 internal +222 singleton +223 begin +224 end +225 internal +226 singleton +227 begin +228 end +229 internal +230 singleton +231 begin +232 end +233 internal +234 singleton +235 begin +236 end +237 internal +238 singleton +239 begin +240 end +241 internal +242 singleton +243 begin +244 end +245 internal +246 singleton +247 begin +248 end +249 internal +250 singleton +251 begin +252 end +253 internal +254 singleton +255 begin +256 end +257 internal +258 singleton +259 begin +260 end +261 internal +262 singleton +263 begin +264 end +265 internal +266 singleton +267 begin +268 end +269 internal +270 singleton +271 begin +272 end +273 internal +274 singleton +275 begin +276 end +277 internal +278 singleton +279 begin +280 end +281 internal +282 singleton +283 begin +284 end +285 internal +286 singleton +287 begin +288 end +289 internal +290 singleton +291 begin +292 end +293 internal +294 singleton +295 begin +296 end +297 internal +298 singleton +299 begin +300 end +301 internal +302 singleton +303 begin +304 end +305 internal +306 singleton +307 begin +308 end +309 internal +310 singleton +311 begin +312 end +313 internal +314 singleton +315 begin +316 end +317 internal +318 singleton +319 begin +320 end +321 internal +322 singleton +323 begin +324 end +325 internal +326 singleton +327 begin +328 end +329 internal +330 singleton +331 begin +332 end +333 internal +334 singleton +335 begin +336 end +337 internal +338 singleton +339 begin +340 end +341 internal +342 singleton +343 begin +344 end +345 internal +346 singleton +347 begin +348 end +349 internal +350 singleton +351 begin +352 end +353 internal +354 singleton +355 begin +356 end +357 internal +358 singleton +359 begin +360 end +361 internal +362 singleton +363 begin +364 end +365 internal +366 singleton +367 begin +368 end +369 internal +370 singleton +371 begin +372 end +373 internal +374 singleton +375 begin +376 end +377 internal +378 singleton +379 begin +380 end +381 internal +382 singleton +383 begin +384 end +385 internal +386 singleton +387 begin +388 end +389 internal +390 singleton +391 begin +392 end +393 internal +394 singleton +395 begin +396 end +397 internal +398 singleton +399 begin +400 end +401 internal +402 singleton +403 begin +404 end +405 internal +406 singleton +407 begin +408 end +409 internal +410 singleton +411 begin +412 end +413 internal +414 singleton +415 begin +416 end +417 internal +418 singleton +419 begin +420 end +421 internal +422 singleton +423 begin +424 end +425 internal +426 singleton +427 begin +428 end +429 internal +430 singleton +431 begin +432 end +433 internal +434 singleton +435 begin +436 end +437 internal +438 singleton +439 begin +440 end +441 internal +442 singleton +443 begin +444 end +445 internal +446 singleton +447 begin +448 end +449 internal +450 singleton +451 begin +452 end +453 internal +454 singleton +455 begin +456 end +457 internal +458 singleton +459 begin +460 end +461 internal +462 singleton +463 begin +464 end +465 internal +466 singleton +467 begin +468 end +469 internal +470 singleton +471 begin +472 end +473 internal +474 singleton +475 begin +476 end +477 internal +478 singleton +479 begin +480 end +481 internal +482 singleton +483 begin +484 end +485 internal +486 singleton +487 begin +488 end +489 internal +490 singleton +491 begin +492 end +493 internal +494 singleton +495 begin +496 end +497 internal +498 singleton +499 begin +500 end +501 internal +502 singleton +503 begin +504 end +505 internal +506 singleton +507 begin +508 end +509 internal +510 singleton +511 begin +512 end +513 internal +514 singleton +515 begin +516 end +517 internal +518 singleton +519 begin +520 end +521 internal +522 singleton +523 begin +524 end +525 internal +526 singleton +527 begin +528 end +529 internal +530 singleton +531 begin +532 end +533 internal +534 singleton +535 begin +536 end +537 internal +538 singleton +539 begin +540 end +541 internal +542 singleton +543 begin +544 end +545 internal +546 singleton +547 begin +548 end +549 internal +550 singleton +551 begin +552 end +553 internal +554 singleton +555 begin +556 end +557 internal +558 singleton +559 begin +560 end +561 internal +562 singleton +563 begin +564 end +565 internal +566 singleton +567 begin +568 end +569 internal +570 singleton +571 begin +572 end +573 internal +574 singleton +575 begin +576 end +577 internal +578 singleton +579 begin +580 end +581 internal +582 singleton +583 begin +584 end +585 internal +586 singleton +587 begin +588 end +589 internal +590 singleton +591 begin +592 end +593 internal +594 singleton +595 begin +596 end +597 internal +598 singleton +599 begin +600 end +601 internal +602 singleton +603 begin +604 end +605 internal +606 singleton +607 begin +608 end +609 internal +610 singleton +611 begin +612 end +613 internal +614 singleton +615 begin +616 end +617 internal +618 singleton +619 begin +620 end +621 internal +622 singleton +623 begin +624 end +625 internal +626 singleton +627 begin +628 end +629 internal +630 singleton +631 begin +632 end +633 internal +634 singleton +635 begin +636 end +637 internal +638 singleton +639 begin +640 end +641 internal +642 singleton +643 begin +644 end +645 internal +646 singleton diff --git a/model/ivector/final.dubm b/model/ivector/final.dubm new file mode 100644 index 0000000..bcf2689 Binary files /dev/null and b/model/ivector/final.dubm differ diff --git a/model/ivector/final.ie b/model/ivector/final.ie new file mode 100644 index 0000000..e1d78a2 Binary files /dev/null and b/model/ivector/final.ie differ diff --git a/model/ivector/final.mat b/model/ivector/final.mat new file mode 100644 index 0000000..1093236 Binary files /dev/null and b/model/ivector/final.mat differ diff --git a/model/ivector/global_cmvn.stats b/model/ivector/global_cmvn.stats new file mode 100644 index 0000000..500cc40 --- /dev/null +++ b/model/ivector/global_cmvn.stats @@ -0,0 +1,3 @@ + [ + 1.117107e+11 -7.827721e+08 -1.101398e+10 -2.193934e+09 -1.347332e+10 -1.613916e+10 -1.199561e+10 -1.255081e+10 -1.638895e+10 -3.821099e+09 -1.372833e+10 -5.244242e+09 -1.098187e+10 -3.655235e+09 -9.364579e+09 -4.285302e+09 -6.296873e+09 -1.552953e+09 -3.176746e+09 -1.202976e+08 -9.857023e+08 2.316555e+08 -1.61059e+08 -5.891868e+07 3.465849e+08 -1.842054e+08 3.248211e+08 -1.483965e+08 3.739239e+08 -6.672061e+08 4.442288e+08 -9.274889e+08 5.142684e+08 4.292036e+07 2.206386e+08 -4.532715e+08 -2.092499e+08 -3.70488e+08 -8.079404e+07 -8.425977e+07 1.344125e+09 + 9.982632e+12 1.02635e+12 8.634624e+11 9.06451e+11 9.652096e+11 1.12772e+12 9.468372e+11 9.141218e+11 9.670484e+11 6.936961e+11 8.141006e+11 6.256321e+11 6.087707e+11 4.616898e+11 4.212042e+11 2.862872e+11 2.498089e+11 1.470856e+11 1.099197e+11 5.780894e+10 3.118114e+10 1.060667e+10 1.466199e+09 4.173056e+08 5.257362e+09 1.277714e+10 2.114478e+10 2.974502e+10 3.587691e+10 4.078971e+10 4.247745e+10 4.382608e+10 4.62521e+10 4.575282e+10 3.546206e+10 3.041531e+10 2.838562e+10 2.258604e+10 1.715295e+10 1.303227e+10 0 ] diff --git a/model/ivector/online_cmvn.conf b/model/ivector/online_cmvn.conf new file mode 100644 index 0000000..e69de29 diff --git a/model/ivector/splice.conf b/model/ivector/splice.conf new file mode 100644 index 0000000..960cd2e --- /dev/null +++ b/model/ivector/splice.conf @@ -0,0 +1,2 @@ +--left-context=3 +--right-context=3 diff --git a/recording_20250920_003720.wav b/recording_20250920_003720.wav new file mode 100644 index 0000000..ceb41e8 Binary files /dev/null and b/recording_20250920_003720.wav differ diff --git a/recording_20250920_003857.wav b/recording_20250920_003857.wav new file mode 100644 index 0000000..be3d92d Binary files /dev/null and b/recording_20250920_003857.wav differ diff --git a/recording_20250920_003912.wav b/recording_20250920_003912.wav new file mode 100644 index 0000000..ae170f5 Binary files /dev/null and b/recording_20250920_003912.wav differ diff --git a/requirements.txt b/requirements.txt index 7c59215..9b6cb52 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,3 @@ -pyaudio -vosk -soxr -numpy -requests -pydub +vosk>=0.3.44 +pyaudio>=0.2.11 +numpy>=1.19.0 \ No newline at end of file diff --git a/simple_wake_and_record.py b/simple_wake_and_record.py new file mode 100644 index 0000000..6430f8b --- /dev/null +++ b/simple_wake_and_record.py @@ -0,0 +1,403 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +""" +简化的唤醒+录音测试 +专注于解决音频冲突问题 +""" + +import sys +import os +import time +import threading +import pyaudio +import json + +# 添加当前目录到路径 +sys.path.append(os.path.dirname(os.path.abspath(__file__))) + +try: + from vosk import Model, KaldiRecognizer + VOSK_AVAILABLE = True +except ImportError: + VOSK_AVAILABLE = False + print("⚠️ Vosk 未安装,请运行: pip install vosk") + +class SimpleWakeAndRecord: + """简化的唤醒+录音系统""" + + def __init__(self, model_path="model", wake_words=["你好", "助手"]): + self.model_path = model_path + self.wake_words = wake_words + self.model = None + self.recognizer = None + self.audio = None + self.stream = None + self.running = False + + # 音频参数 + self.FORMAT = pyaudio.paInt16 + self.CHANNELS = 1 + self.RATE = 16000 + self.CHUNK_SIZE = 1024 + + # 录音相关 + self.recording = False + self.recorded_frames = [] + self.last_text_time = None # 最后一次识别到文字的时间 + self.recording_start_time = None + self.recording_recognizer = None # 录音时专用的识别器 + + # 阈值 + self.text_silence_threshold = 3.0 # 3秒没有识别到文字就结束 + self.min_recording_time = 2.0 # 最小录音时间 + self.max_recording_time = 30.0 # 最大录音时间 + + self._setup_model() + self._setup_audio() + + def _setup_model(self): + """设置 Vosk 模型""" + if not VOSK_AVAILABLE: + return + + try: + if not os.path.exists(self.model_path): + print(f"模型路径不存在: {self.model_path}") + return + + self.model = Model(self.model_path) + self.recognizer = KaldiRecognizer(self.model, self.RATE) + self.recognizer.SetWords(True) + + print(f"✅ Vosk 模型加载成功") + + except Exception as e: + print(f"模型初始化失败: {e}") + + def _setup_audio(self): + """设置音频设备""" + try: + if self.audio is None: + self.audio = pyaudio.PyAudio() + + if self.stream is None: + self.stream = self.audio.open( + format=self.FORMAT, + channels=self.CHANNELS, + rate=self.RATE, + input=True, + frames_per_buffer=self.CHUNK_SIZE + ) + + print("✅ 音频设备初始化成功") + + except Exception as e: + print(f"音频设备初始化失败: {e}") + + def _calculate_energy(self, audio_data): + """计算音频能量""" + if len(audio_data) == 0: + return 0 + + import numpy as np + audio_array = np.frombuffer(audio_data, dtype=np.int16) + rms = np.sqrt(np.mean(audio_array ** 2)) + return rms + + def _check_wake_word(self, text): + """检查是否包含唤醒词""" + if not text or not self.wake_words: + return False, None + + text_lower = text.lower() + for wake_word in self.wake_words: + if wake_word.lower() in text_lower: + return True, wake_word + return False, None + + def _save_recording(self, audio_data): + """保存录音""" + timestamp = time.strftime("%Y%m%d_%H%M%S") + filename = f"recording_{timestamp}.wav" + + try: + import wave + with wave.open(filename, 'wb') as wf: + wf.setnchannels(self.CHANNELS) + wf.setsampwidth(self.audio.get_sample_size(self.FORMAT)) + wf.setframerate(self.RATE) + wf.writeframes(audio_data) + + print(f"✅ 录音已保存: {filename}") + return True, filename + except Exception as e: + print(f"保存录音失败: {e}") + return False, None + + def _play_audio(self, filename): + """播放音频文件""" + try: + import wave + + # 打开音频文件 + with wave.open(filename, 'rb') as wf: + # 获取音频参数 + channels = wf.getnchannels() + width = wf.getsampwidth() + rate = wf.getframerate() + total_frames = wf.getnframes() + + # 分块读取音频数据,避免内存问题 + chunk_size = 1024 + frames = [] + + for _ in range(0, total_frames, chunk_size): + chunk = wf.readframes(chunk_size) + if chunk: + frames.append(chunk) + else: + break + + # 创建播放流 + playback_stream = self.audio.open( + format=self.audio.get_format_from_width(width), + channels=channels, + rate=rate, + output=True + ) + + print(f"🔊 开始播放: {filename}") + + # 分块播放音频 + for chunk in frames: + playback_stream.write(chunk) + + # 等待播放完成 + playback_stream.stop_stream() + playback_stream.close() + + print("✅ 播放完成") + + except Exception as e: + print(f"❌ 播放失败: {e}") + # 如果pyaudio播放失败,尝试用系统命令播放 + self._play_with_system_player(filename) + + def _play_with_system_player(self, filename): + """使用系统播放器播放音频""" + try: + import platform + import subprocess + + system = platform.system() + + if system == 'Darwin': # macOS + cmd = ['afplay', filename] + elif system == 'Windows': + cmd = ['start', '/min', filename] + else: # Linux + cmd = ['aplay', filename] + + print(f"🔊 使用系统播放器: {' '.join(cmd)}") + subprocess.run(cmd, check=True) + print("✅ 播放完成") + + except Exception as e: + print(f"❌ 系统播放器也失败: {e}") + print(f"💡 文件已保存,请手动播放: {filename}") + + def _start_recording(self): + """开始录音""" + print("🎙️ 开始录音,请说话...") + self.recording = True + self.recorded_frames = [] + self.last_text_time = None + self.recording_start_time = time.time() + + # 为录音创建一个新的识别器 + if self.model: + self.recording_recognizer = KaldiRecognizer(self.model, self.RATE) + self.recording_recognizer.SetWords(True) + + def _stop_recording(self): + """停止录音""" + if len(self.recorded_frames) > 0: + audio_data = b''.join(self.recorded_frames) + duration = len(audio_data) / (self.RATE * 2) + print(f"📝 录音完成,时长: {duration:.2f}秒") + + # 保存录音 + success, filename = self._save_recording(audio_data) + + # 如果保存成功,播放录音 + if success and filename: + print("=" * 50) + print("🔊 播放刚才录制的音频...") + self._play_audio(filename) + print("=" * 50) + + self.recording = False + self.recorded_frames = [] + self.last_text_time = None + self.recording_start_time = None + self.recording_recognizer = None + + def start(self): + """开始唤醒词检测和录音""" + if not self.stream: + print("❌ 音频设备未初始化") + return + + self.running = True + print("🎤 开始监听...") + print(f"唤醒词: {', '.join(self.wake_words)}") + + try: + while self.running: + # 读取音频数据 + data = self.stream.read(self.CHUNK_SIZE, exception_on_overflow=False) + + if len(data) == 0: + continue + + if self.recording: + # 录音模式 + self.recorded_frames.append(data) + recording_duration = time.time() - self.recording_start_time + + # 使用录音专用的识别器进行实时识别 + if self.recording_recognizer: + if self.recording_recognizer.AcceptWaveform(data): + # 获取最终识别结果 + result = json.loads(self.recording_recognizer.Result()) + text = result.get('text', '').strip() + + if text: + # 识别到文字,更新时间戳 + self.last_text_time = time.time() + print(f"\n📝 识别: {text}") + else: + # 获取部分识别结果 + partial_result = json.loads(self.recording_recognizer.PartialResult()) + partial_text = partial_result.get('partial', '').strip() + + if partial_text: + # 更新时间戳(部分识别也算有声音) + self.last_text_time = time.time() + status = f"录音中... {recording_duration:.1f}s | {partial_text}" + print(f"\r{status}", end='', flush=True) + + # 检查是否需要结束录音 + current_time = time.time() + + # 检查是否有文字识别超时 + if self.last_text_time is not None: + text_silence_duration = current_time - self.last_text_time + if text_silence_duration > self.text_silence_threshold and recording_duration >= self.min_recording_time: + print(f"\n\n3秒没有识别到文字,结束录音") + self._stop_recording() + else: + # 还没有识别到任何文字,检查是否超时 + if recording_duration > 5.0: # 如果5秒还没识别到任何文字,也结束 + print(f"\n\n5秒没有识别到文字,结束录音") + self._stop_recording() + + # 检查最大录音时间 + if recording_duration > self.max_recording_time: + print(f"\n\n达到最大录音时间 {self.max_recording_time}s") + self._stop_recording() + + # 显示录音状态 + if self.last_text_time is None: + status = f"等待语音输入... {recording_duration:.1f}s" + print(f"\r{status}", end='', flush=True) + + elif self.model and self.recognizer: + # 唤醒词检测模式 + if self.recognizer.AcceptWaveform(data): + result = json.loads(self.recognizer.Result()) + text = result.get('text', '').strip() + + if text: + print(f"识别: {text}") + + # 检查唤醒词 + is_wake_word, detected_word = self._check_wake_word(text) + if is_wake_word: + print(f"🎯 检测到唤醒词: {detected_word}") + self._start_recording() + else: + # 显示实时音频级别 + energy = self._calculate_energy(data) + if energy > 50: # 只显示有意义的音频级别 + partial_result = json.loads(self.recognizer.PartialResult()) + partial_text = partial_result.get('partial', '') + if partial_text: + status = f"监听中... 能量: {energy:.0f} | {partial_text}" + else: + status = f"监听中... 能量: {energy:.0f}" + print(status, end='\r') + + time.sleep(0.01) + + except KeyboardInterrupt: + print("\n👋 退出") + except Exception as e: + print(f"错误: {e}") + finally: + self.stop() + + def stop(self): + """停止""" + self.running = False + if self.recording: + self._stop_recording() + + if self.stream: + self.stream.stop_stream() + self.stream.close() + self.stream = None + + if self.audio: + self.audio.terminate() + self.audio = None + +def main(): + """主函数""" + print("🚀 简化唤醒+录音测试") + print("=" * 50) + + # 检查模型 + model_dir = "model" + if not os.path.exists(model_dir): + print("⚠️ 未找到模型目录") + print("请下载 Vosk 模型到 model 目录") + return + + # 创建系统 + system = SimpleWakeAndRecord( + model_path=model_dir, + wake_words=["你好", "助手", "小爱"] + ) + + if not system.model: + print("❌ 模型加载失败") + return + + print("✅ 系统初始化成功") + print("📖 使用说明:") + print("1. 说唤醒词开始录音") + print("2. 基于语音识别判断,3秒没有识别到文字就结束") + print("3. 最少录音2秒,最多30秒") + print("4. 录音时实时显示识别结果") + print("5. 录音文件自动保存") + print("6. 录音完成后自动播放刚才录制的内容") + print("7. 按 Ctrl+C 退出") + print("=" * 50) + + # 开始运行 + system.start() + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/test_audio_playback.py b/test_audio_playback.py deleted file mode 100644 index 7b92157..0000000 --- a/test_audio_playback.py +++ /dev/null @@ -1,119 +0,0 @@ -#!/usr/bin/env python3 -""" -音频播放测试脚本 -用于测试树莓派的音频播放功能 -""" - -import subprocess -import time -import sys -import os - -def test_audio_playback(): - """测试音频播放功能""" - print("=== 音频播放测试 ===") - - # 检查音频设备 - print("\n1. 检查音频设备...") - try: - result = subprocess.run(['aplay', '-l'], capture_output=True, text=True) - if result.returncode == 0: - print("音频设备列表:") - print(result.stdout) - else: - print("错误: 无法获取音频设备列表") - return False - except FileNotFoundError: - print("错误: aplay 命令未找到,请安装 alsa-utils") - return False - - # 测试播放系统声音 - print("\n2. 测试播放系统提示音...") - try: - # 使用系统内置的测试声音 - result = subprocess.run(['speaker-test', '-t', 'sine', '-f', '440', '-l', '1'], - capture_output=True, text=True, timeout=5) - if result.returncode == 0: - print("✓ 系统提示音播放成功") - else: - print("✗ 系统提示音播放失败") - return False - except (subprocess.TimeoutExpired, FileNotFoundError): - print("提示: speaker-test 测试跳过,尝试直接播放音频文件") - - # 创建测试音频文件并播放 - print("\n3. 创建并播放测试音频文件...") - test_audio_file = "/tmp/test_audio.wav" - - # 使用sox生成测试音频(如果可用) - if os.path.exists("/usr/bin/sox"): - try: - subprocess.run(['sox', '-n', '-r', '44100', '-c', '2', test_audio_file, - 'synth', '3', 'sine', '440'], check=True) - print("✓ 测试音频文件创建成功") - except (subprocess.CalledProcessError, FileNotFoundError): - print("无法创建测试音频文件,跳过文件播放测试") - return True - else: - print("sox 未安装,跳过文件播放测试") - return True - - # 播放测试音频文件 - try: - result = subprocess.run(['aplay', test_audio_file], capture_output=True, text=True) - if result.returncode == 0: - print("✓ 音频文件播放成功") - return True - else: - print("✗ 音频文件播放失败") - print(f"错误信息: {result.stderr}") - return False - except FileNotFoundError: - print("错误: aplay 命令未找到") - return False - finally: - # 清理测试文件 - if os.path.exists(test_audio_file): - os.remove(test_audio_file) - -def check_volume(): - """检查并设置音量""" - print("\n4. 检查音量设置...") - try: - result = subprocess.run(['amixer', 'sget', 'Master'], capture_output=True, text=True) - if result.returncode == 0: - print("当前音量设置:") - print(result.stdout) - - # 设置音量到80% - subprocess.run(['amixer', 'sset', 'Master', '80%'], check=True) - print("✓ 音量已设置为80%") - return True - else: - print("无法获取音量信息") - return False - except (subprocess.CalledProcessError, FileNotFoundError): - print("amixer 命令未找到或执行失败") - return False - -if __name__ == "__main__": - print("树莓派音频播放功能测试") - print("=" * 40) - - success = True - - # 检查音量 - if not check_volume(): - success = False - - # 测试音频播放 - if not test_audio_playback(): - success = False - - print("\n" + "=" * 40) - if success: - print("✓ 所有音频播放测试通过") - sys.exit(0) - else: - print("✗ 部分音频播放测试失败") - sys.exit(1) \ No newline at end of file diff --git a/test_audio_recording.py b/test_audio_recording.py deleted file mode 100644 index e4e206d..0000000 --- a/test_audio_recording.py +++ /dev/null @@ -1,187 +0,0 @@ -#!/usr/bin/env python3 -""" -音频录音测试脚本 -用于测试树莓派的音频录音功能 -""" - -import subprocess -import time -import sys -import os -import signal - -def test_audio_recording(): - """测试音频录音功能""" - print("=== 音频录音测试 ===") - - # 检查录音设备 - print("\n1. 检查录音设备...") - try: - result = subprocess.run(['arecord', '-l'], capture_output=True, text=True) - if result.returncode == 0: - print("录音设备列表:") - print(result.stdout) - else: - print("错误: 无法获取录音设备列表") - return False - except FileNotFoundError: - print("错误: arecord 命令未找到,请安装 alsa-utils") - return False - - # 录制测试音频 - print("\n2. 录制测试音频(5秒)...") - test_record_file = "/tmp/test_record.wav" - - try: - print("请对着麦克风说话(5秒录音开始)...") - - # 录制5秒音频 - result = subprocess.run(['arecord', '-d', '5', '-f', 'cd', test_record_file], - capture_output=True, text=True) - - if result.returncode == 0: - print("✓ 音频录制成功") - - # 检查文件是否存在且大小合理 - if os.path.exists(test_record_file): - file_size = os.path.getsize(test_record_file) - print(f"录制文件大小: {file_size} 字节") - - if file_size > 1000: # 至少1KB - print("✓ 录音文件大小正常") - return True - else: - print("✗ 录音文件太小,可能录音失败") - return False - else: - print("✗ 录音文件未创建") - return False - else: - print("✗ 音频录制失败") - print(f"错误信息: {result.stderr}") - return False - - except FileNotFoundError: - print("错误: arecord 命令未找到") - return False - except KeyboardInterrupt: - print("\n录音被用户中断") - return False - -def test_audio_playback_verification(): - """播放录制的音频进行验证""" - print("\n3. 播放录制的音频进行验证...") - test_record_file = "/tmp/test_record.wav" - - if not os.path.exists(test_record_file): - print("错误: 找不到录制的音频文件") - return False - - try: - print("播放录制的音频...") - result = subprocess.run(['aplay', test_record_file], capture_output=True, text=True) - - if result.returncode == 0: - print("✓ 录音播放成功") - return True - else: - print("✗ 录音播放失败") - print(f"错误信息: {result.stderr}") - return False - - except FileNotFoundError: - print("错误: aplay 命令未找到") - return False - -def test_microphone_levels(): - """测试麦克风音量级别""" - print("\n4. 测试麦克风音量级别...") - - try: - # 获取麦克风音量 - result = subprocess.run(['amixer', 'sget', 'Capture'], capture_output=True, text=True) - - if result.returncode == 0: - print("当前麦克风音量:") - print(result.stdout) - - # 设置麦克风音量 - subprocess.run(['amixer', 'sset', 'Capture', '80%'], check=True) - print("✓ 麦克风音量已设置为80%") - return True - else: - print("无法获取麦克风音量信息") - return False - - except (subprocess.CalledProcessError, FileNotFoundError): - print("amixer 命令未找到或执行失败") - return False - -def test_realtime_monitoring(): - """实时音频监控测试""" - print("\n5. 实时音频监控测试(3秒)...") - - try: - print("开始实时监控,请对着麦克风说话...") - - # 使用parecord进行实时监控(如果可用) - cmd = ['parecord', '--monitor', '--latency-msec', '100', '--duration', '3', '/dev/null'] - - result = subprocess.run(cmd, capture_output=True, text=True, timeout=5) - - if result.returncode == 0: - print("✓ 实时监控测试成功") - return True - else: - print("提示: 实时监控测试跳过(需要pulseaudio)") - return True - - except (subprocess.TimeoutExpired, FileNotFoundError, subprocess.CalledProcessError): - print("提示: 实时监控测试跳过") - return True - -def cleanup(): - """清理测试文件""" - test_files = ["/tmp/test_record.wav"] - - for file_path in test_files: - if os.path.exists(file_path): - try: - os.remove(file_path) - print(f"✓ 已清理测试文件: {file_path}") - except OSError: - print(f"警告: 无法清理测试文件: {file_path}") - -if __name__ == "__main__": - print("树莓派音频录音功能测试") - print("=" * 40) - - success = True - - # 测试麦克风音量 - if not test_microphone_levels(): - success = False - - # 测试音频录制 - if not test_audio_recording(): - success = False - - # 播放录制的音频 - if os.path.exists("/tmp/test_record.wav"): - if not test_audio_playback_verification(): - success = False - - # 实时监控测试 - if not test_realtime_monitoring(): - success = False - - print("\n" + "=" * 40) - if success: - print("✓ 所有音频录音测试通过") - else: - print("✗ 部分音频录音测试失败") - - # 清理测试文件 - cleanup() - - sys.exit(0 if success else 1) \ No newline at end of file diff --git a/va_config.json b/va_config.json deleted file mode 100644 index ed9868d..0000000 --- a/va_config.json +++ /dev/null @@ -1,10 +0,0 @@ -{ - "volume": 10, - "mic_name": "bcm2835 Headphones: - (hw:0,0)", - "audio_output_device": "bcm2835 Headphones", - "model_name": "qwen2.5:0.5b", - "voice": "en_US-kathleen-low.onnx", - "enable_audio_processing": false, - "history_length": 6, - "system_prompt": "You are a helpful assistant." -} diff --git a/voice_assistant.py b/voice_assistant.py deleted file mode 100644 index fbd844f..0000000 --- a/voice_assistant.py +++ /dev/null @@ -1,469 +0,0 @@ -#!/usr/bin/env python3 -""" -Voice Assistant: Real-Time Voice Chat - -This app runs on a Raspberry Pi (or Linux desktop) and creates a low-latency, full-duplex voice interaction -with an AI character. It uses local speech recognition -(Vosk), local text-to-speech synthesis (Piper), and a locally hosted large language model via Ollama. - -Key Features: -- Wake-free, continuous voice recognition with real-time transcription -- LLM-driven responses streamed from a selected local model (e.g., LLaMA, Qwen, Gemma) -- Audio response synthesis with a gruff custom voice using ONNX-based Piper models -- Optional noise mixing and filtering via SoX -- System volume control via ALSA -- Modular and responsive design suitable for low-latency, character-driven agents - -Ideal for embedded voice AI demos, cosplay companions, or standalone AI characters. - -Copyright: M15.ai -License: MIT -""" - -import io -import json -import os -import queue -import re -import subprocess -import threading -import time -import wave - -import numpy as np -import pyaudio -import requests -import soxr -from pydub import AudioSegment -from vosk import KaldiRecognizer, Model - - -# ------------------- TIMING UTILITY ------------------- -class Timer: - def __init__(self, label): - self.label = label - self.enabled = True - def __enter__(self): - self.start = time.time() - return self - def __exit__(self, exc_type, exc_val, exc_tb): - if self.enabled: - elapsed_ms = (time.time() - self.start) * 1000 - print(f"[Timing] {self.label}: {elapsed_ms:.0f} ms") - def disable(self): - self.enabled = False - -# ------------------- FUNCTIONS ------------------- - -def get_input_device_index(preferred_name="Shure MVX2U"): - pa = pyaudio.PyAudio() - index = None - for i in range(pa.get_device_count()): - info = pa.get_device_info_by_index(i) - if preferred_name.lower() in info['name'].lower() and info['maxInputChannels'] > 0: - print(f"[Debug] Selected input device {i}: {info['name']}") - print(f"[Debug] Device sample rate: {info['defaultSampleRate']} Hz") - index = i - break - pa.terminate() - if index is None: - print("[Warning] Preferred mic not found. Falling back to default.") - return index - -def get_output_device_index(preferred_name): - pa = pyaudio.PyAudio() - for i in range(pa.get_device_count()): - info = pa.get_device_info_by_index(i) - if preferred_name.lower() in info['name'].lower() and info['maxOutputChannels'] > 0: - print(f"[Debug] Selected output device {i}: {info['name']}") - return i - print("[Warning] Preferred output device not found. Using default index 0.") - return 0 - -def parse_card_number(device_str): - """ - Extract ALSA card number from string like 'plughw:3,0' - """ - try: - return int(device_str.split(":")[1].split(",")[0]) - except Exception as e: - print(f"[Warning] Could not parse card number from {device_str}: {e}") - return 0 # fallback - -def list_input_devices(): - pa = pyaudio.PyAudio() - print("[Debug] Available input devices:") - for i in range(pa.get_device_count()): - info = pa.get_device_info_by_index(i) - if info['maxInputChannels'] > 0: - print(f" {i}: {info['name']} ({int(info['defaultSampleRate'])} Hz, {info['maxInputChannels']}ch)") - pa.terminate() - -def resample_audio(data, orig_rate=48000, target_rate=16000): - # Convert byte string to numpy array - audio_np = np.frombuffer(data, dtype=np.int16) - # Resample using soxr - resampled_np = soxr.resample(audio_np, orig_rate, target_rate) - # Convert back to bytes - return resampled_np.astype(np.int16).tobytes() - -def set_output_volume(volume_level, card_id=3): - """ - Set output volume using ALSA 'Speaker' control on specified card. - volume_level: 1–10 (user scale) - card_id: ALSA card number (from aplay -l) - """ - percent = max(1, min(volume_level, 10)) * 10 # map to 10–100% - try: - subprocess.run( - ['amixer', '-c', str(card_id), 'sset', 'Speaker', f'{percent}%'], - check=True, - stdout=subprocess.DEVNULL, - stderr=subprocess.DEVNULL - ) - print(f"[Debug] Volume set to {percent}% on card {card_id}") - except Exception as e: - print(f"[Warning] Volume control failed on card {card_id}: {e}") - -# ------------------- PATHS ------------------- - -CONFIG_PATH = os.path.expanduser("va_config.json") -BASE_DIR = os.path.dirname(__file__) -MODEL_PATH = os.path.join(BASE_DIR, 'vosk-model') -CHAT_URL = 'https://open.bigmodel.cn/api/paas/v4/chat/completions' -AUTH_TOKEN = '0c9cbaca9d2bbf864990f1e1decdf340.dXRMsZCHTUbPQ0rm' # Replace with your actual token - -# ------------------- CONFIG FILE LOADING ------------------- - -DEFAULT_CONFIG = { - "volume": 9, - "mic_name": "Plantronics", - "audio_output_device": "Plantronics", - "model_name": "qwen2.5:0.5b", - "voice": "en_US-kathleen-low.onnx", - "enable_audio_processing": False, - "history_length": 4, - "system_prompt": "You are a helpful assistant." -} - -def load_config(): - # Load config from system file or fall back to defaults - if os.path.isfile(CONFIG_PATH): - try: - with open(CONFIG_PATH, 'r') as f: - user_config = json.load(f) - return {**DEFAULT_CONFIG, **user_config} # merge with defaults - except Exception as e: - print(f"[Warning] Failed to load system config: {e}") - - print("[Debug] Using default config.") - - return DEFAULT_CONFIG - -config = load_config() - -# Apply loaded config values -VOLUME = config["volume"] -MIC_NAME = config["mic_name"] -AUDIO_OUTPUT_DEVICE = config["audio_output_device"] -AUDIO_OUTPUT_DEVICE_INDEX = get_output_device_index(config["audio_output_device"]) -OUTPUT_CARD = parse_card_number(AUDIO_OUTPUT_DEVICE) -MODEL_NAME = config["model_name"] -VOICE_MODEL = os.path.join("voices", config["voice"]) -ENABLE_AUDIO_PROCESSING = config["enable_audio_processing"] -HISTORY_LENGTH = config["history_length"] - -# Set system volume -set_output_volume(VOLUME, OUTPUT_CARD) - -# Setup messages with system prompt -messages = [{"role": "system", "content": config["system_prompt"]}] - -list_input_devices() -RATE = 48000 -CHUNK = 1024 -CHANNELS = 1 -mic_enabled = True -DEVICE_INDEX = get_input_device_index() - -# SOUND EFFECTS -NOISE_LEVEL = '0.04' -BANDPASS_HIGHPASS = '300' -BANDPASS_LOWPASS = '800' - -# ------------------- VOICE MODEL ------------------- - -VOICE_MODELS_DIR = os.path.join(BASE_DIR, 'voices') -if not os.path.isdir(VOICE_MODELS_DIR): - os.makedirs(VOICE_MODELS_DIR) - -VOICE_MODEL = os.path.join(VOICE_MODELS_DIR, config["voice"]) - -print('[Debug] Available Piper voices:') -for f in os.listdir(VOICE_MODELS_DIR): - if f.endswith('.onnx'): - print(' ', f) -print(f'[Debug] Using VOICE_MODEL: {VOICE_MODEL}') -print(f"[Debug] Config loaded: model={MODEL_NAME}, voice={config['voice']}, vol={VOLUME}, mic={MIC_NAME}") - -# ------------------- CONVERSATION STATE ------------------- - -audio_queue = queue.Queue() - -# Audio callback form Shure -def audio_callback(in_data, frame_count, time_info, status): - global mic_enabled - if not mic_enabled: - return (None, pyaudio.paContinue) - resampled_data = resample_audio(in_data, orig_rate=48000, target_rate=16000) - audio_queue.put(resampled_data) - return (None, pyaudio.paContinue) - -# ------------------- STREAM SETUP ------------------- - -def start_stream(): - pa = pyaudio.PyAudio() - - stream = pa.open( - rate=RATE, - format=pyaudio.paInt16, - channels=CHANNELS, - input=True, - input_device_index=DEVICE_INDEX, - frames_per_buffer=CHUNK, - stream_callback=audio_callback - ) - stream.start_stream() - print(f'[Debug] Stream @ {RATE}Hz') - return pa, stream - -# ------------------- QUERY OLLAMA CHAT ENDPOINT ------------------- - -def query_glm(): - headers = { - 'Authorization': f'Bearer {AUTH_TOKEN}', - 'Content-Type': 'application/json' - } - payload = { - "model": "glm-4.5", - "messages": [messages[0]] + messages[-HISTORY_LENGTH:], # force system prompt at top - "temperature": 0.6, - "max_tokens": 1024, - "stream": False - } - - with Timer("Inference"): # measure inference latency - resp = requests.post(CHAT_URL, json=payload, headers=headers) - - if resp.status_code != 200: - print(f'[Error] GLM API failed with status {resp.status_code}: {resp.text}') - return '' - - data = resp.json() - # Extract assistant message - reply = '' - if 'choices' in data and len(data['choices']) > 0: - choice = data['choices'][0] - if 'message' in choice and 'content' in choice['message']: - reply = choice['message']['content'].strip() - return reply - -# ------------------- TTS & DEGRADATION ------------------- - -import tempfile - - -def play_response(text): - import io - import tempfile - - # Mute the mic during playback to avoid feedback loop - global mic_enabled - mic_enabled = False # 🔇 mute mic - - # clean the response - clean = re.sub(r"[\*]+", '', text) # remove asterisks - clean = re.sub(r"\(.*?\)", '', clean) # remove (stage directions) - clean = re.sub(r"<.*?>", '', clean) # remove HTML-style tags - clean = clean.replace('\n', ' ').strip() # normalize newlines - clean = re.sub(r'\s+', ' ', clean) # collapse whitespace - clean = re.sub(r'[\U0001F300-\U0001FAFF\u2600-\u26FF\u2700-\u27BF]+', '', clean) # remove emojis - - piper_path = os.path.join(BASE_DIR, 'bin', 'piper', 'piper') - - # 1. Generate Piper raw PCM - with Timer("Piper inference"): - piper_proc = subprocess.Popen( - [piper_path, '--model', VOICE_MODEL, '--output_raw'], - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - stderr=subprocess.DEVNULL - ) - tts_pcm, _ = piper_proc.communicate(input=clean.encode()) - - if ENABLE_AUDIO_PROCESSING: - # SoX timing consolidation - sox_start = time.time() - - # 2. Convert raw PCM to WAV - pcm_to_wav = subprocess.Popen( - ['sox', '-t', 'raw', '-r', '16000', '-c', str(CHANNELS), '-b', '16', - '-e', 'signed-integer', '-', '-t', 'wav', '-'], - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - stderr=subprocess.DEVNULL - ) - tts_wav_16k, _ = pcm_to_wav.communicate(input=tts_pcm) - - # 3. Estimate duration - duration_sec = len(tts_pcm) / (RATE * 2) - - # 4. Generate white noise WAV bytes - noise_bytes = subprocess.check_output([ - 'sox', '-n', - '-r', '16000', - '-c', str(CHANNELS), - '-b', '16', - '-e', 'signed-integer', - '-t', 'wav', '-', - 'synth', str(duration_sec), - 'whitenoise', 'vol', NOISE_LEVEL - ], stderr=subprocess.DEVNULL) - - # 5. Write both to temp files & mix - with tempfile.NamedTemporaryFile(suffix='.wav') as tts_file, tempfile.NamedTemporaryFile(suffix='.wav') as noise_file: - tts_file.write(tts_wav_16k) - noise_file.write(noise_bytes) - tts_file.flush() - noise_file.flush() - mixer = subprocess.Popen( - ['sox', '-m', tts_file.name, noise_file.name, '-t', 'wav', '-'], - stdout=subprocess.PIPE, - stderr=subprocess.DEVNULL - ) - mixed_bytes, _ = mixer.communicate() - - # 6. Apply filter - filter_proc = subprocess.Popen( - #['sox', '-t', 'wav', '-', '-t', 'wav', '-', 'highpass', BANDPASS_HIGHPASS, 'lowpass', BANDPASS_LOWPASS], - ['sox', '-t', 'wav', '-', '-r', '48000', '-t', 'wav', '-', - 'highpass', BANDPASS_HIGHPASS, 'lowpass', BANDPASS_LOWPASS], - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - stderr=subprocess.DEVNULL - ) - final_bytes, _ = filter_proc.communicate(input=mixed_bytes) - - sox_elapsed = (time.time() - sox_start) * 1000 - print(f"[Timing] SoX (total): {int(sox_elapsed)} ms") - - else: - # No FX: just convert raw PCM to WAV - pcm_to_wav = subprocess.Popen( - ['sox', '-t', 'raw', '-r', '16000', '-c', str(CHANNELS), '-b', '16', - '-e', 'signed-integer', '-', '-t', 'wav', '-'], - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - stderr=subprocess.DEVNULL - ) - tts_wav_16k, _ = pcm_to_wav.communicate(input=tts_pcm) - - resample_proc = subprocess.Popen( - ['sox', '-t', 'wav', '-', '-r', '48000', '-t', 'wav', '-'], - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - stderr=subprocess.DEVNULL - ) - final_bytes, _ = resample_proc.communicate(input=tts_wav_16k) - - # 7. Playback - with Timer("Playback"): - try: - wf = wave.open(io.BytesIO(final_bytes), 'rb') - - - pa = pyaudio.PyAudio() - stream = pa.open( - format=pa.get_format_from_width(wf.getsampwidth()), - channels=wf.getnchannels(), - rate=wf.getframerate(), - output=True, - output_device_index=AUDIO_OUTPUT_DEVICE_INDEX - ) - - data = wf.readframes(CHUNK) - while data: - stream.write(data) - data = wf.readframes(CHUNK) - - stream.stop_stream() - stream.close() - pa.terminate() - wf.close() - - except wave.Error as e: - print(f"[Error] Could not open final WAV: {e}") - - finally: - mic_enabled = True # 🔊 unmute mic - time.sleep(0.3) # optional: small cooldown - - -# ------------------- PROCESSING LOOP ------------------- - -def processing_loop(): - model = Model(MODEL_PATH) - #rec = KaldiRecognizer(model, RATE) - rec = KaldiRecognizer(model, 16000) - MAX_DEBUG_LEN = 200 # optional: limit length of debug output - LOW_EFFORT_UTTERANCES = {"huh", "uh", "um", "erm", "hmm", "he's", "but"} - - while True: - data = audio_queue.get() - - if rec.AcceptWaveform(data): - start = time.time() - r = json.loads(rec.Result()) - elapsed_ms = int((time.time() - start) * 1000) - - user = r.get('text', '').strip() - if user: - print(f"[Timing] STT parse: {elapsed_ms} ms") - print("User:", user) - - if user.lower().strip(".,!? ") in LOW_EFFORT_UTTERANCES: - print("[Debug] Ignored low-effort utterance.") - rec = KaldiRecognizer(model, 16000) - continue # Skip LLM response + TTS for accidental noise - - messages.append({"role": "user", "content": user}) - # Generate assistant response - resp_text = query_glm() - if resp_text: - # Clean debug print (remove newlines and carriage returns) - clean_debug_text = resp_text.replace('\n', ' ').replace('\r', ' ') - if len(clean_debug_text) > MAX_DEBUG_LEN: - clean_debug_text = clean_debug_text[:MAX_DEBUG_LEN] + '...' - - print('Assistant:', clean_debug_text) - messages.append({"role": "assistant", "content": clean_debug_text}) - - # TTS generation + playback - play_response(resp_text) - else: - print('[Debug] Empty response, skipping TTS.') - - # Reset recognizer after each full interaction - rec = KaldiRecognizer(model, 16000) - -# ------------------- MAIN ------------------- - -if __name__ == '__main__': - pa, stream = start_stream() - t = threading.Thread(target=processing_loop, daemon=True) - t.start() - try: - while stream.is_active(): - time.sleep(0.1) - except KeyboardInterrupt: - stream.stop_stream(); stream.close(); pa.terminate() diff --git a/voice_recorder.py b/voice_recorder.py new file mode 100644 index 0000000..b5b943f --- /dev/null +++ b/voice_recorder.py @@ -0,0 +1,344 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +""" +语音录制模块 +基于pyaudio实现,支持语音活动检测(VAD)自动判断录音结束 +""" + +import pyaudio +import wave +import numpy as np +import time +import os +import threading +from collections import deque + +class VoiceRecorder: + """语音录制器,支持自动检测语音结束""" + + def __init__(self, + energy_threshold=500, + silence_threshold=1.0, + min_recording_time=0.5, + max_recording_time=10.0, + sample_rate=16000, + chunk_size=1024, + defer_audio_init=False): + """ + 初始化录音器 + + Args: + energy_threshold: 语音能量阈值 + silence_threshold: 静音持续时间阈值(秒) + min_recording_time: 最小录音时间(秒) + max_recording_time: 最大录音时间(秒) + sample_rate: 采样率 + chunk_size: 音频块大小 + defer_audio_init: 是否延迟音频初始化 + """ + self.energy_threshold = energy_threshold + self.silence_threshold = silence_threshold + self.min_recording_time = min_recording_time + self.max_recording_time = max_recording_time + self.sample_rate = sample_rate + self.chunk_size = chunk_size + self.defer_audio_init = defer_audio_init + + # 音频参数 + self.FORMAT = pyaudio.paInt16 + self.CHANNELS = 1 + + # 状态变量 + self.audio = None + self.stream = None + self.recording = False + self.recorded_frames = [] + + # 语音检测相关 + self.silence_start_time = None + self.recording_start_time = None + self.audio_buffer = deque(maxlen=int(sample_rate / chunk_size * 2)) # 2秒缓冲 + + # 回调函数 + self.on_recording_complete = None + self.on_speech_detected = None + + if not defer_audio_init: + self._setup_audio() + + def _setup_audio(self): + """设置音频设备""" + try: + self.audio = pyaudio.PyAudio() + + # 获取默认输入设备信息 + device_info = self.audio.get_default_input_device_info() + print(f"使用音频设备: {device_info['name']}") + + except Exception as e: + print(f"音频设备初始化失败: {e}") + raise + + def _calculate_energy(self, audio_data): + """计算音频能量""" + if len(audio_data) == 0: + return 0 + + # 转换为numpy数组 + audio_array = np.frombuffer(audio_data, dtype=np.int16) + + # 计算RMS能量 + rms = np.sqrt(np.mean(audio_array ** 2)) + return rms + + def _is_speech(self, audio_data): + """判断是否为语音""" + energy = self._calculate_energy(audio_data) + return energy > self.energy_threshold + + def _open_stream(self): + """打开音频流""" + if self.stream is not None: + return + + self.stream = self.audio.open( + format=self.FORMAT, + channels=self.CHANNELS, + rate=self.sample_rate, + input=True, + frames_per_buffer=self.chunk_size + ) + + def _close_stream(self): + """关闭音频流""" + if self.stream: + self.stream.stop_stream() + self.stream.close() + self.stream = None + + def start_listening(self): + """开始监听语音""" + if self.recording: + print("正在录音中...") + return + + self._open_stream() + self.recording = True + self.recorded_frames = [] + self.silence_start_time = None + self.recording_start_time = None + + print("开始监听语音...") + + # 在新线程中录音 + recording_thread = threading.Thread(target=self._record_loop) + recording_thread.daemon = True + recording_thread.start() + + def _record_loop(self): + """录音循环""" + try: + while self.recording: + # 读取音频数据 + data = self.stream.read(self.chunk_size, exception_on_overflow=False) + + if len(data) == 0: + continue + + # 计算能量 + energy = self._calculate_energy(data) + + # 添加到缓冲区 + self.audio_buffer.append(data) + + # 检测语音活动 + if energy > self.energy_threshold: + # 检测到语音 + if self.recording_start_time is None: + # 开始录音 + self.recording_start_time = time.time() + self.silence_start_time = None + self.recorded_frames = list(self.audio_buffer) # 包含之前的音频 + + print("🎤 检测到语音,开始录音...") + + if self.on_speech_detected: + self.on_speech_detected() + + # 重置静音计时 + self.silence_start_time = None + + # 录音 + self.recorded_frames.append(data) + + elif self.recording_start_time is not None: + # 之前有语音,现在检查是否静音 + if self.silence_start_time is None: + self.silence_start_time = time.time() + + # 继续录音 + self.recorded_frames.append(data) + + # 检查是否静音超时 + silence_duration = time.time() - self.silence_start_time + if silence_duration > self.silence_threshold: + recording_duration = time.time() - self.recording_start_time + + # 检查最小录音时间 + if recording_duration >= self.min_recording_time: + print(f"静音 {silence_duration:.1f}s,结束录音") + self.stop_recording() + break + else: + print(f"录音时间太短 ({recording_duration:.1f}s),继续等待...") + self.silence_start_time = time.time() + + # 检查最大录音时间 + if self.recording_start_time is not None: + recording_duration = time.time() - self.recording_start_time + if recording_duration > self.max_recording_time: + print(f"达到最大录音时间 {self.max_recording_time}s,结束录音") + self.stop_recording() + break + + # 短暂休眠 + time.sleep(0.01) + + except Exception as e: + print(f"录音过程中发生错误: {e}") + self.stop_recording() + + def stop_recording(self): + """停止录音""" + if not self.recording: + return + + self.recording = False + self._close_stream() + + if len(self.recorded_frames) > 0: + # 保存录音 + audio_data = b''.join(self.recorded_frames) + + print(f"录音完成,共 {len(self.recorded_frames)} 帧") + print(f"录音时长: {len(audio_data) / (self.sample_rate * 2):.2f} 秒") + + # 调用回调函数 + if self.on_recording_complete: + self.on_recording_complete(audio_data) + + # 重置状态 + self.recorded_frames = [] + self.silence_start_time = None + self.recording_start_time = None + + def save_audio(self, audio_data, filename): + """保存音频到文件""" + try: + with wave.open(filename, 'wb') as wf: + wf.setnchannels(self.CHANNELS) + wf.setsampwidth(self.audio.get_sample_size(self.FORMAT)) + wf.setframerate(self.sample_rate) + wf.writeframes(audio_data) + + print(f"音频已保存到: {filename}") + return True + except Exception as e: + print(f"保存音频失败: {e}") + return False + + def set_recording_complete_callback(self, callback): + """设置录音完成回调函数""" + self.on_recording_complete = callback + + def set_speech_detected_callback(self, callback): + """设置语音检测回调函数""" + self.on_speech_detected = callback + + def adjust_sensitivity(self, energy_threshold=None, silence_threshold=None): + """调整灵敏度""" + if energy_threshold is not None: + self.energy_threshold = energy_threshold + print(f"能量阈值调整为: {energy_threshold}") + + if silence_threshold is not None: + self.silence_threshold = silence_threshold + print(f"静音阈值调整为: {silence_threshold}秒") + + def get_audio_level(self): + """获取当前音频级别""" + if len(self.audio_buffer) > 0: + latest_data = self.audio_buffer[-1] + return self._calculate_energy(latest_data) + return 0 + + def cleanup(self): + """清理资源""" + self.stop_recording() + if self.audio: + self.audio.terminate() + self.audio = None + +def main(): + """测试录音功能""" + print("🎙️ 语音录制测试") + print("=" * 50) + print("配置:") + print("- 能量阈值: 500") + print("- 静音阈值: 1.0秒") + print("- 最小录音时间: 0.5秒") + print("- 最大录音时间: 10秒") + print("=" * 50) + print("请说话测试录音功能...") + print("按 Ctrl+C 退出") + + def on_recording_complete(audio_data): + """录音完成回调""" + # 保存录音文件 + timestamp = time.strftime("%Y%m%d_%H%M%S") + filename = f"recording_{timestamp}.wav" + + recorder.save_audio(audio_data, filename) + print(f"✅ 录音文件已保存: {filename}") + + # 显示录音信息 + duration = len(audio_data) / (recorder.sample_rate * 2) + print(f"录音时长: {duration:.2f} 秒") + + def on_speech_detected(): + """检测到语音回调""" + print("🔊 检测到语音活动...") + + # 创建录音器 + recorder = VoiceRecorder( + energy_threshold=500, + silence_threshold=1.0, + min_recording_time=0.5, + max_recording_time=10.0 + ) + + # 设置回调 + recorder.set_recording_complete_callback(on_recording_complete) + recorder.set_speech_detected_callback(on_speech_detected) + + try: + # 开始监听 + recorder.start_listening() + + # 保持程序运行 + while True: + time.sleep(0.1) + + # 显示当前音频级别(可选) + level = recorder.get_audio_level() + if level > 100: + print(f"当前音频级别: {level:.0f}", end='\r') + + except KeyboardInterrupt: + print("\n👋 退出录音测试") + finally: + recorder.cleanup() + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/vosk-model/.DS_Store b/vosk-model/.DS_Store deleted file mode 100644 index 96d2d27..0000000 Binary files a/vosk-model/.DS_Store and /dev/null differ