# maxkb/apps/common/handle/impl/media/media_adapter/adapter.py

# -*- coding: utf-8 -*-
"""
音视频处理适配器
复用MaxKB的模型系统，保持模块独立性
"""
import os
import json
import tempfile
from typing import Dict, List, Optional, Any
from concurrent.futures import ThreadPoolExecutor

class MediaAdapter:
    """
    Adapter for audio/video processing.

    Resolves STT/LLM model instances through MaxKB's model system and hands
    the file to an audio or video processor. Project dependencies are
    imported inside the methods that need them rather than at module level.
    """

    # Lower-case extensions (without the dot) routed to the video processor;
    # every other file name is treated as audio.
    _VIDEO_EXTENSIONS = frozenset(
        {'mp4', 'avi', 'mov', 'mkv', 'webm', 'flv', 'wmv'}
    )

    def __init__(self, logger=None):
        """
        Create the adapter.

        Args:
            logger: Logger used for progress and error messages. When falsy,
                one is built by ``_get_default_logger``.
        """
        self.logger = logger or self._get_default_logger()
        from .config import MediaConfig
        self.config = MediaConfig()

    def _get_default_logger(self):
        """
        Build the default logger.

        Returns:
            A ``MediaLogger`` wrapping MaxKB's ``maxkb_logger`` when that can
            be imported and wrapped, otherwise a ``MediaLogger`` wrapping the
            stdlib logger named ``'MediaAdapter'``.
        """
        from .logger import MediaLogger
        try:
            from common.utils.logger import maxkb_logger
            return MediaLogger(maxkb_logger)
        except Exception:
            # Best-effort fallback. ``Exception`` (not a bare ``except``) so
            # KeyboardInterrupt / SystemExit are never swallowed here.
            import logging
            return MediaLogger(logging.getLogger('MediaAdapter'))

    def process_media(self,
                      file_content: bytes,
                      file_name: str,
                      stt_model_id: Optional[str] = None,
                      llm_model_id: Optional[str] = None,
                      workspace_id: Optional[str] = None,
                      options: Optional[Dict[str, Any]] = None) -> Dict:
        """
        Process an audio or video file.

        Args:
            file_content: Raw bytes of the file.
            file_name: File name; its extension selects the audio or video
                processor.
            stt_model_id: ID of the STT model in the MaxKB system.
            llm_model_id: ID of the LLM model (used for text enhancement,
                optional).
            workspace_id: Workspace ID; a model is only looked up when both
                its ID and the workspace ID are provided.
            options: Extra options forwarded unchanged to the processor.
                Documented keys:
                - language: language (zh/en/auto)
                - segment_duration: segment length in seconds
                - enable_punctuation: whether to add punctuation
                - enable_summary: whether to generate summaries

        Returns:
            Whatever the selected processor's ``process`` returns. It is
            expected (not enforced here) to look like:
            {
                'status': 'success',
                'media_type': 'audio/video',
                'duration': 120.5,
                'segments': [
                    {
                        'index': 0,
                        'start_time': 0,
                        'end_time': 60,
                        'text': 'transcribed text',
                        'enhanced_text': 'enhanced text',
                        'summary': 'segment summary'
                    }
                ],
                'full_text': 'full text',
                'metadata': {
                    'stt_model': 'model_name',
                    'language': 'zh',
                    'processing_time': 10.5
                }
            }

        Raises:
            Exception: Any error raised while selecting or running the
                processor is logged and re-raised unchanged. Failures to
                load a model are logged only; processing continues with
                that model set to ``None``.
        """

        options = options or {}
        self.logger.info(f"开始处理媒体文件: {file_name}")
        self.logger.info("接收到的参数:")
        self.logger.info(f"  - stt_model_id: {stt_model_id}")
        self.logger.info(f"  - workspace_id: {workspace_id}")
        self.logger.info(f"  - llm_model_id: {llm_model_id}")
        self.logger.info(f"  - options: {options}")
        self.logger.info(f"  - enable_summary in options: {options.get('enable_summary')}")

        try:
            # Decide between the audio and video pipeline.
            media_type = self._detect_media_type(file_name)

            # Resolve the STT model instance. The import lives inside the
            # try so that a missing models_provider is logged, not raised.
            stt_model = None
            if stt_model_id and workspace_id:
                try:
                    from models_provider.tools import get_model_instance_by_model_workspace_id
                    stt_model = get_model_instance_by_model_workspace_id(stt_model_id, workspace_id)
                    self.logger.info(f"成功获取STT模型实例: {stt_model}")
                except Exception as e:
                    self.logger.error(f"获取STT模型失败: {e}")
            else:
                self.logger.warning(f"STT模型未配置 - stt_model_id: {stt_model_id}, workspace_id: {workspace_id}")

            # Resolve the LLM model instance (optional, so failures are
            # only warnings).
            llm_model = None
            if llm_model_id and workspace_id:
                try:
                    from models_provider.tools import get_model_instance_by_model_workspace_id
                    llm_model = get_model_instance_by_model_workspace_id(llm_model_id, workspace_id)
                    self.logger.info(f"使用LLM模型: {llm_model_id}")
                except Exception as e:
                    self.logger.warning(f"获取LLM模型失败: {e}")

            # Pick the processor matching the media type.
            if media_type == 'video':
                from .processors.video_processor import VideoProcessor
                processor = VideoProcessor(self.config, self.logger)
            else:
                from .processors.audio_processor import AudioProcessor
                processor = AudioProcessor(self.config, self.logger)

            result = processor.process(
                file_content=file_content,
                file_name=file_name,
                stt_model=stt_model,
                llm_model=llm_model,
                options=options
            )

            self.logger.info(f"媒体文件处理成功: {file_name}")
            return result

        except Exception as e:
            self.logger.error(f"处理媒体文件失败: {e}")
            raise

    def _detect_media_type(self, file_name: str) -> str:
        """
        Classify a file as video or audio from its extension.

        Args:
            file_name: File name to inspect; matching is case-insensitive.

        Returns:
            ``'video'`` when the text after the last ``.`` is a known video
            extension, otherwise ``'audio'``. A name containing no ``.`` has
            no extension and is therefore always ``'audio'``.
        """
        # rpartition reports whether a dot was actually found, so a bare
        # name such as "mp4" is not mistaken for its own extension.
        _, dot, extension = file_name.rpartition('.')
        if dot and extension.lower() in self._VIDEO_EXTENSIONS:
            return 'video'
        return 'audio'