feat: Vllm whisper model

2025-08-20 18:24:21 +08:00 · 2025-08-20 18:24:21 +08:00 · c8ec7c5558
commit c8ec7c5558
parent 27aeba47c4
7 changed files with 158 additions and 6 deletions
--- a/apps/locales/en_US/LC_MESSAGES/django.po
+++ b/apps/locales/en_US/LC_MESSAGES/django.po
@ -8663,4 +8663,7 @@ msgid "resource authorization"
 msgstr ""
 msgid "The Qwen Audio based end-to-end speech recognition model supports audio recognition within 3 minutes. At present, it mainly supports Chinese and English recognition."
 msgstr ""
 msgid "If not passed, the default value is 'zh'"
 msgstr ""
--- a/apps/locales/zh_CN/LC_MESSAGES/django.po
+++ b/apps/locales/zh_CN/LC_MESSAGES/django.po
@ -8789,4 +8789,7 @@ msgid "resource authorization"
 msgstr "资源授权"
 msgid "The Qwen Audio based end-to-end speech recognition model supports audio recognition within 3 minutes. At present, it mainly supports Chinese and English recognition."
-msgstr "基于Qwen-Audio的端到端语音识别大模型，支持3分钟以内的音频识别，目前主要支持中英文识别。"
+msgstr "基于Qwen-Audio的端到端语音识别大模型，支持3分钟以内的音频识别，目前主要支持中英文识别。"
 msgid "If not passed, the default value is 'zh'"
 msgstr "如果未传递，则默认值为'zh'"
--- a/apps/locales/zh_Hant/LC_MESSAGES/django.po
+++ b/apps/locales/zh_Hant/LC_MESSAGES/django.po
@ -8789,4 +8789,7 @@ msgid "resource authorization"
 msgstr "資源授權"
 msgid "The Qwen Audio based end-to-end speech recognition model supports audio recognition within 3 minutes. At present, it mainly supports Chinese and English recognition."
-msgstr "基於Qwen-Audio的端到端語音辨識大模型，支持3分鐘以內的音訊識別，現時主要支持中英文識別。"
+msgstr "基於Qwen-Audio的端到端語音辨識大模型，支持3分鐘以內的音訊識別，現時主要支持中英文識別。"
 msgid "If not passed, the default value is 'zh'"
 msgstr "如果未傳遞，則預設值為'zh'"
--- a/apps/models_provider/impl/vllm_model_provider/credential/whisper_stt.py
+++ b/apps/models_provider/impl/vllm_model_provider/credential/whisper_stt.py
@ -0,0 +1,62 @@
 # coding=utf-8
 import traceback
 from typing import Dict
 from django.utils.translation import gettext_lazy as _, gettext
 from langchain_core.messages import HumanMessage
 from common import forms
 from common.exception.app_exception import AppApiException
 from common.forms import BaseForm, TooltipLabel
 from models_provider.base_model_provider import BaseModelCredential, ValidCode
 class VLLMWhisperModelParams(BaseForm):
    """Parameter form offered for vLLM Whisper speech-to-text models."""
    # Single text parameter named 'Language'. The form marks it as required
    # and pre-fills it with 'zh'; the tooltip tells the user that 'zh' is the
    # value used when nothing is passed.
    Language = forms.TextInputField(
        TooltipLabel(_('Language'),
                     _("If not passed, the default value is 'zh'")),
        required=True,
        default_value='zh',
    )
 class VLLMWhisperModelCredential(BaseForm, BaseModelCredential):
    """Credential form and validator for Whisper models served by vLLM.

    Collects the API URL and API key, validates them against the provider and
    supplies the parameter form used for Whisper models.
    """
    api_url = forms.TextInputField('API URL', required=True)
    api_key = forms.PasswordInputField('API Key', required=True)
    def is_valid(self,
                 model_type: str,
                 model_name,
                 model_credential: Dict[str, object],
                 model_params,
                 provider,
                 raise_exception=False):
        """Check that the credential can be used for ``model_name``.

        :param model_type: model type value; must be one the provider lists.
        :param model_name: name of the model to look up on the server.
        :param model_credential: mapping holding ``api_url`` and ``api_key``.
        :param model_params: keyword parameters forwarded to the model; may be
            ``None``.
        :param provider: provider used to list models and build the model.
        :param raise_exception: when true, a failure while building or probing
            the model is raised as ``AppApiException``; otherwise ``False`` is
            returned for such a failure.
        :return: ``True`` when every check passed, ``False`` when the final
            model probe failed and ``raise_exception`` is false.
        :raises AppApiException: unsupported model type, unreachable API,
            unknown model, or (with ``raise_exception``) a failed model probe.
        """
        model_type_list = provider.get_model_type_list()
        if not any(mt.get('value') == model_type for mt in model_type_list):
            raise AppApiException(ValidCode.valid_error.value,
                                  gettext('{model_type} Model type is not supported').format(model_type=model_type))
        try:
            model_list = provider.get_base_model_list(model_credential.get('api_url'), model_credential.get('api_key'))
        except Exception as e:
            # Keep the original cause attached so the real network/HTTP error
            # is still visible in the traceback.
            raise AppApiException(ValidCode.valid_error.value, gettext('API domain name is invalid')) from e
        exist = provider.get_model_info_by_name(model_list, model_name)
        if len(exist) == 0:
            raise AppApiException(ValidCode.valid_error.value,
                                  gettext('The model does not exist, please download the model first'))
        try:
            model = provider.get_model(model_type, model_name, model_credential, **(model_params or {}))
            # Actually exercise the model instead of only constructing it.
            model.check_auth()
        except AppApiException:
            raise
        except Exception as e:
            if raise_exception:
                raise AppApiException(
                    ValidCode.valid_error.value,
                    gettext('Verification failed, please check whether the parameters are correct: {error}').format(
                        error=str(e))) from e
            return False
        return True
    def encryption_dict(self, model_info: Dict[str, object]):
        """Return a copy of ``model_info`` whose ``api_key`` is passed through
        the base class ``encryption`` helper."""
        return {**model_info, 'api_key': super().encryption(model_info.get('api_key', ''))}
    def build_model(self, model_info: Dict[str, object]):
        """Load ``api_key`` from ``model_info`` onto this credential.

        :raises AppApiException: when ``api_key`` or ``model`` is missing.
        :return: this credential instance.
        """
        for key in ['api_key', 'model']:
            if key not in model_info:
                raise AppApiException(500, gettext('{key}  is required').format(key=key))
        self.api_key = model_info.get('api_key')
        return self
    def get_model_params_setting_form(self, model_name):
        """Return the parameter form for Whisper models; ``model_name`` is not
        used to pick the form."""
        return VLLMWhisperModelParams()
--- a/apps/models_provider/impl/vllm_model_provider/model/iat_mp3_16k.mp3
+++ b/apps/models_provider/impl/vllm_model_provider/model/iat_mp3_16k.mp3
--- a/apps/models_provider/impl/vllm_model_provider/model/whisper_sst.py
+++ b/apps/models_provider/impl/vllm_model_provider/model/whisper_sst.py
@ -0,0 +1,64 @@
 import base64
 import os
 import traceback
 from typing import Dict
 from openai import OpenAI
 from common.utils.logger import maxkb_logger
 from models_provider.base_model_provider import MaxKBBaseModel
 from models_provider.impl.base_stt import BaseSpeechToText
 class VllmWhisperSpeechToText(MaxKBBaseModel, BaseSpeechToText):
    """Speech-to-text model that sends audio to a vLLM server using the
    OpenAI client pointed at ``<api_url>/v1``."""
    api_key: str
    api_url: str
    model: str
    params: dict
    def __init__(self, **kwargs):
        """Store ``api_key``, ``model``, ``params`` and ``api_url`` taken from
        the keyword arguments."""
        super().__init__(**kwargs)
        self.api_key = kwargs.get('api_key')
        self.model = kwargs.get('model')
        self.params = kwargs.get('params')
        self.api_url = kwargs.get('api_url')
    @staticmethod
    def is_cache_model():
        """Report that instances of this model are not to be cached."""
        return False
    @staticmethod
    def new_instance(model_type, model_name, model_credential: Dict[str, object], **model_kwargs):
        """Build an instance from a credential mapping and model parameters.

        ``model_kwargs`` is passed both as ``params`` and as individual
        keyword arguments.
        """
        return VllmWhisperSpeechToText(
            model=model_name,
            api_key=model_credential.get('api_key'),
            api_url=model_credential.get('api_url'),
            params=model_kwargs,
            **model_kwargs
        )
    def check_auth(self):
        """Transcribe the bundled sample audio to verify the connection.

        Unlike ``speech_to_text`` this lets any error propagate, so a wrong
        URL, key or model name is reported to the caller instead of being
        logged and ignored.
        """
        cwd = os.path.dirname(os.path.abspath(__file__))
        with open(f'{cwd}/iat_mp3_16k.mp3', 'rb') as audio_file:
            self._transcribe(audio_file)
    def _transcribe(self, audio_file):
        """Send ``audio_file`` to the transcription endpoint and return the
        recognised text; errors are not caught here."""
        # Tolerate a trailing slash on the configured URL to avoid '//v1'.
        base_url = f"{self.api_url.rstrip('/')}/v1"
        client = OpenAI(
            api_key=self.api_key,
            base_url=base_url
        )
        # Fall back to 'zh' when no language was supplied, matching the
        # default advertised for the 'Language' parameter.
        language = (self.params or {}).get('Language') or 'zh'
        result = client.audio.transcriptions.create(
            file=audio_file,
            model=self.model,
            language=language,
            response_format="json"
        )
        return result.text
    def speech_to_text(self, audio_file):
        """Transcribe ``audio_file`` and return the text.

        Best effort: on any failure the error is logged and ``None`` is
        returned rather than raising.
        """
        try:
            return self._transcribe(audio_file)
        except Exception as err:
            maxkb_logger.error(f":Error: {str(err)}: {traceback.format_exc()}")
            return None
--- a/apps/models_provider/impl/vllm_model_provider/vllm_model_provider.py
+++ b/apps/models_provider/impl/vllm_model_provider/vllm_model_provider.py
@ -10,20 +10,27 @@ from models_provider.base_model_provider import IModelProvider, ModelProvideInfo
 from models_provider.impl.vllm_model_provider.credential.embedding import VllmEmbeddingCredential
 from models_provider.impl.vllm_model_provider.credential.image import VllmImageModelCredential
 from models_provider.impl.vllm_model_provider.credential.llm import VLLMModelCredential
 from models_provider.impl.vllm_model_provider.credential.whisper_stt import VLLMWhisperModelCredential
 from models_provider.impl.vllm_model_provider.model.embedding import VllmEmbeddingModel
 from models_provider.impl.vllm_model_provider.model.image import VllmImage
 from models_provider.impl.vllm_model_provider.model.llm import VllmChatModel
 from maxkb.conf import PROJECT_DIR
 from django.utils.translation import gettext as _
 from models_provider.impl.vllm_model_provider.model.whisper_sst import VllmWhisperSpeechToText
 v_llm_model_credential = VLLMModelCredential()
 image_model_credential = VllmImageModelCredential()
 embedding_model_credential = VllmEmbeddingCredential()
 whisper_model_credential = VLLMWhisperModelCredential()
 model_info_list = [
-    ModelInfo('facebook/opt-125m', _('Facebook’s 125M parameter model'), ModelTypeConst.LLM, v_llm_model_credential, VllmChatModel),
+    ModelInfo('facebook/opt-125m', _('Facebook’s 125M parameter model'), ModelTypeConst.LLM, v_llm_model_credential,
-    ModelInfo('BAAI/Aquila-7B', _('BAAI’s 7B parameter model'), ModelTypeConst.LLM, v_llm_model_credential, VllmChatModel),
+              VllmChatModel),
-    ModelInfo('BAAI/AquilaChat-7B', _('BAAI’s 13B parameter mode'), ModelTypeConst.LLM, v_llm_model_credential, VllmChatModel),
+    ModelInfo('BAAI/Aquila-7B', _('BAAI’s 7B parameter model'), ModelTypeConst.LLM, v_llm_model_credential,
              VllmChatModel),
    ModelInfo('BAAI/AquilaChat-7B', _('BAAI’s 13B parameter mode'), ModelTypeConst.LLM, v_llm_model_credential,
              VllmChatModel),
 ]
@ -32,7 +39,15 @@ image_model_info_list = [
 ]
 embedding_model_info_list = [
-    ModelInfo('HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1.5', '', ModelTypeConst.EMBEDDING, embedding_model_credential, VllmEmbeddingModel),
+    ModelInfo('HIT-TMG/KaLM-embedding-multilingual-mini-instruct-v1.5', '', ModelTypeConst.EMBEDDING,
              embedding_model_credential, VllmEmbeddingModel),
 ]
 # Whisper speech-to-text models; every entry shares the same model type,
 # credential and model class, and 'whisper-tiny' stays first in the list.
 whisper_model_info_list = [
    ModelInfo(whisper_model_name, '', ModelTypeConst.STT, whisper_model_credential, VllmWhisperSpeechToText)
    for whisper_model_name in (
        'whisper-tiny',
        'whisper-large-v3-turbo',
        'whisper-small',
        'whisper-large-v3',
    )
 ]
 model_info_manage = (
@ -45,6 +60,8 @@ model_info_manage = (
    .append_default_model_info(image_model_info_list[0])
    .append_model_info_list(embedding_model_info_list)
    .append_default_model_info(embedding_model_info_list[0])
    .append_model_info_list(whisper_model_info_list)
    .append_default_model_info(whisper_model_info_list[0])
    .build()
 )