feat: Qwen2.5-omni-7b omni-modal speech recognition

This commit is contained in:
zhangzhanwei 2025-08-18 11:01:54 +08:00 committed by zhanweizhang7
parent a85c36f289
commit b32b06391f
4 changed files with 15 additions and 11 deletions

View File

@ -15,7 +15,7 @@ from models_provider.impl.aliyun_bai_lian_model_provider.credential.embedding im
AliyunBaiLianEmbeddingCredential AliyunBaiLianEmbeddingCredential
from models_provider.impl.aliyun_bai_lian_model_provider.credential.image import QwenVLModelCredential from models_provider.impl.aliyun_bai_lian_model_provider.credential.image import QwenVLModelCredential
from models_provider.impl.aliyun_bai_lian_model_provider.credential.llm import BaiLianLLMModelCredential from models_provider.impl.aliyun_bai_lian_model_provider.credential.llm import BaiLianLLMModelCredential
from models_provider.impl.aliyun_bai_lian_model_provider.credential.omi_stt import AliyunBaiLianOmiSTTModelCredential from models_provider.impl.aliyun_bai_lian_model_provider.credential.omni_stt import AliyunBaiLianOmiSTTModelCredential
from models_provider.impl.aliyun_bai_lian_model_provider.credential.reranker import \ from models_provider.impl.aliyun_bai_lian_model_provider.credential.reranker import \
AliyunBaiLianRerankerCredential AliyunBaiLianRerankerCredential
from models_provider.impl.aliyun_bai_lian_model_provider.credential.stt import AliyunBaiLianSTTModelCredential from models_provider.impl.aliyun_bai_lian_model_provider.credential.stt import AliyunBaiLianSTTModelCredential
@ -24,7 +24,7 @@ from models_provider.impl.aliyun_bai_lian_model_provider.credential.tts import A
from models_provider.impl.aliyun_bai_lian_model_provider.model.embedding import AliyunBaiLianEmbedding from models_provider.impl.aliyun_bai_lian_model_provider.model.embedding import AliyunBaiLianEmbedding
from models_provider.impl.aliyun_bai_lian_model_provider.model.image import QwenVLChatModel from models_provider.impl.aliyun_bai_lian_model_provider.model.image import QwenVLChatModel
from models_provider.impl.aliyun_bai_lian_model_provider.model.llm import BaiLianChatModel from models_provider.impl.aliyun_bai_lian_model_provider.model.llm import BaiLianChatModel
from models_provider.impl.aliyun_bai_lian_model_provider.model.omi_stt import AliyunBaiLianOmiSpeechToText from models_provider.impl.aliyun_bai_lian_model_provider.model.omni_stt import AliyunBaiLianOmiSpeechToText
from models_provider.impl.aliyun_bai_lian_model_provider.model.reranker import AliyunBaiLianReranker from models_provider.impl.aliyun_bai_lian_model_provider.model.reranker import AliyunBaiLianReranker
from models_provider.impl.aliyun_bai_lian_model_provider.model.stt import AliyunBaiLianSpeechToText from models_provider.impl.aliyun_bai_lian_model_provider.model.stt import AliyunBaiLianSpeechToText
from models_provider.impl.aliyun_bai_lian_model_provider.model.tti import QwenTextToImageModel from models_provider.impl.aliyun_bai_lian_model_provider.model.tti import QwenTextToImageModel
@ -80,6 +80,9 @@ model_info_list = [ModelInfo('gte-rerank',
ModelInfo('qwen-omni-turbo', ModelInfo('qwen-omni-turbo',
_('The Qwen Omni series model supports inputting multiple modalities of data, including video, audio, images, and text, and outputting audio and text.'), _('The Qwen Omni series model supports inputting multiple modalities of data, including video, audio, images, and text, and outputting audio and text.'),
ModelTypeConst.STT, aliyun_bai_lian_omi_stt_model_credential, AliyunBaiLianOmiSpeechToText), ModelTypeConst.STT, aliyun_bai_lian_omi_stt_model_credential, AliyunBaiLianOmiSpeechToText),
ModelInfo('qwen2.5-omni-7b',
_('The Qwen Omni series model supports inputting multiple modalities of data, including video, audio, images, and text, and outputting audio and text.'),
ModelTypeConst.STT, aliyun_bai_lian_omi_stt_model_credential, AliyunBaiLianOmiSpeechToText),
] ]
module_info_vl_list = [ module_info_vl_list = [

View File

@ -17,7 +17,8 @@ class AliyunBaiLianOmiSTTModelParams(BaseForm):
class AliyunBaiLianOmiSTTModelCredential(BaseForm, BaseModelCredential): class AliyunBaiLianOmiSTTModelCredential(BaseForm, BaseModelCredential):
api_key = PasswordInputField("API key", required=True) api_url = forms.TextInputField(_('API URL'), required=True)
api_key = forms.PasswordInputField(_('API Key'), required=True)
def is_valid(self, def is_valid(self,
model_type: str, model_type: str,

View File

@ -12,6 +12,7 @@ from models_provider.impl.base_stt import BaseSpeechToText
class AliyunBaiLianOmiSpeechToText(MaxKBBaseModel, BaseSpeechToText): class AliyunBaiLianOmiSpeechToText(MaxKBBaseModel, BaseSpeechToText):
api_key: str api_key: str
api_url: str
model: str model: str
params: dict params: dict
@ -20,6 +21,7 @@ class AliyunBaiLianOmiSpeechToText(MaxKBBaseModel, BaseSpeechToText):
self.api_key = kwargs.get('api_key') self.api_key = kwargs.get('api_key')
self.model = kwargs.get('model') self.model = kwargs.get('model')
self.params = kwargs.get('params') self.params = kwargs.get('params')
self.api_url = kwargs.get('api_url')
@staticmethod @staticmethod
def is_cache_model(): def is_cache_model():
@ -30,6 +32,7 @@ class AliyunBaiLianOmiSpeechToText(MaxKBBaseModel, BaseSpeechToText):
return AliyunBaiLianOmiSpeechToText( return AliyunBaiLianOmiSpeechToText(
model=model_name, model=model_name,
api_key=model_credential.get('api_key'), api_key=model_credential.get('api_key'),
api_url=model_credential.get('api_url') ,
params= model_kwargs, params= model_kwargs,
**model_kwargs **model_kwargs
) )
@ -47,13 +50,13 @@ class AliyunBaiLianOmiSpeechToText(MaxKBBaseModel, BaseSpeechToText):
client = OpenAI( client = OpenAI(
# 若没有配置环境变量请用阿里云百炼API Key将下行替换为api_key="sk-xxx", # 若没有配置环境变量请用阿里云百炼API Key将下行替换为api_key="sk-xxx",
api_key=self.api_key, api_key=self.api_key,
base_url="https://dashscope.aliyuncs.com/compatible-mode/v1", base_url=self.api_url,
) )
base64_audio = base64.b64encode(audio_file.read()).decode("utf-8") base64_audio = base64.b64encode(audio_file.read()).decode("utf-8")
completion = client.chat.completions.create( completion = client.chat.completions.create(
model="qwen-omni-turbo-0119", model=self.model,
messages=[ messages=[
{ {
"role": "user", "role": "user",
@ -71,16 +74,15 @@ class AliyunBaiLianOmiSpeechToText(MaxKBBaseModel, BaseSpeechToText):
], ],
# 设置输出数据的模态,当前支持两种:["text","audio"]、["text"] # 设置输出数据的模态,当前支持两种:["text","audio"]、["text"]
modalities=["text"], modalities=["text"],
audio={"voice": "Cherry", "format": "mp3"},
# stream 必须设置为 True否则会报错 # stream 必须设置为 True否则会报错
stream=True, stream=True,
stream_options={"include_usage": True}, stream_options={"include_usage": True},
) )
result = [] result = []
for chunk in completion: for chunk in completion:
if chunk.choices and hasattr(chunk.choices[0].delta, 'audio'): if chunk.choices and hasattr(chunk.choices[0].delta, 'content'):
transcript = chunk.choices[0].delta.audio.get('transcript') content = chunk.choices[0].delta.content
result.append(transcript) result.append(content)
return "".join(result) return "".join(result)
except Exception as err: except Exception as err:

View File

@ -30,8 +30,6 @@ class AliyunBaiLianSpeechToText(MaxKBBaseModel, BaseSpeechToText):
optional_params['max_tokens'] = model_kwargs['max_tokens'] optional_params['max_tokens'] = model_kwargs['max_tokens']
if 'temperature' in model_kwargs and model_kwargs['temperature'] is not None: if 'temperature' in model_kwargs and model_kwargs['temperature'] is not None:
optional_params['temperature'] = model_kwargs['temperature'] optional_params['temperature'] = model_kwargs['temperature']
if model_name == 'qwen-omni-turbo':
optional_params['streaming'] = True
return AliyunBaiLianSpeechToText( return AliyunBaiLianSpeechToText(
model=model_name, model=model_name,
api_key=model_credential.get('api_key'), api_key=model_credential.get('api_key'),