diff --git a/apps/common/handle/impl/media/media_split_handle.py b/apps/common/handle/impl/media/media_split_handle.py
index b1c28a78..a2ab299f 100644
--- a/apps/common/handle/impl/media/media_split_handle.py
+++ b/apps/common/handle/impl/media/media_split_handle.py
@@ -28,68 +28,29 @@ class MediaSplitHandle(BaseSplitHandle):
 
 
     def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image, **kwargs):
-        """处理音视频文件 - 使用默认文本"""
+        """Split an audio/video file into MaxKB paragraphs.
+
+        Real transcription only runs when the caller passes
+        ``use_actual_processing=True`` together with a ``stt_model_id``;
+        otherwise placeholder demo paragraphs are generated.
+        """
 
         maxkb_logger.info(f"MediaSplitHandle.handle called with file: {file.name}")
-        maxkb_logger.info(f"Using default text for media processing (no actual audio processing)")
 
-        # 获取文件名和类型
-        file_name = file.name
-        file_ext = file_name.lower().split('.')[-1]
+        # Real processing is opt-in and needs an STT model to transcribe with.
+        use_actual_processing = kwargs.get('use_actual_processing', False)
+        stt_model_id = kwargs.get('stt_model_id')
 
-        # 判断媒体类型
-        audio_exts = {'mp3', 'wav', 'm4a', 'flac', 'aac', 'ogg', 'wma'}
-        video_exts = {'mp4', 'avi', 'mov', 'mkv', 'webm', 'flv', 'wmv'}
-
-        if file_ext in audio_exts:
-            media_type = "音频"
-            default_segments = self._get_audio_default_segments(file_name)
-        elif file_ext in video_exts:
-            media_type = "视频"
-            default_segments = self._get_video_default_segments(file_name)
+        if use_actual_processing and stt_model_id:
+            result = self._handle_actual_processing(file, get_buffer, **kwargs)
         else:
-            media_type = "媒体"
-            default_segments = self._get_media_default_segments(file_name)
-
-        maxkb_logger.info(f"Processing {media_type} file: {file_name}")
-        maxkb_logger.info(f"Generating {len(default_segments)} default segments")
-
-        # 转换为MaxKB段落格式
-        paragraphs = []
-        for i, segment_data in enumerate(default_segments):
-            paragraph = {
-                'content': segment_data['content'],
-                'title': segment_data['title'],
-                'metadata': {
-                    'start_time': segment_data.get('start_time'),
-                    'end_time': segment_data.get('end_time'),
-                    'index': i,
-                    'is_demo': True,
-                    'media_type': media_type,
-                    'file_name': file_name
-                }
-            }
-            paragraphs.append(paragraph)
-
-        # 应用限制
-        if limit > 0:
-            paragraphs = paragraphs[:limit]
-
-        # 添加处理元数据
-        metadata = {
-            'media_processing_status': 'success',
-            'media_type': media_type,
-            'is_demo_content': True,
-            'processing_mode': 'default_text'
-        }
-
-        maxkb_logger.info(f"Successfully created {len(paragraphs)} default paragraphs for {file_name}")
-
-        return {
-            'name': file.name,
-            'content': paragraphs,
-            'metadata': metadata
-        }
+            result = self._handle_default_text(file, **kwargs)
+
+        # Keep honouring ``limit``: the pre-refactor implementation truncated
+        # the paragraph list, and callers still pass the parameter.
+        if limit > 0:
+            result['content'] = result['content'][:limit]
+        return result
 
     def _get_audio_default_segments(self, file_name: str) -> List[dict]:
         """生成音频文件的默认段落"""
@@ -160,6 +121,183 @@ class MediaSplitHandle(BaseSplitHandle):
             }
         ]
 
+    def _handle_default_text(self, file, **kwargs) -> dict:
+        """Build placeholder (demo) paragraphs without reading the media content.
+
+        The media category is derived from the file extension alone, and every
+        paragraph is flagged ``is_demo`` so it can be told apart from a real
+        transcription.
+        """
+
+        maxkb_logger.info(f"Using default text for media processing: {file.name}")
+
+        file_name = file.name
+        file_ext = file_name.lower().split('.')[-1]
+
+        # Classify by extension; unknown extensions use the generic segments.
+        audio_exts = {'mp3', 'wav', 'm4a', 'flac', 'aac', 'ogg', 'wma'}
+        video_exts = {'mp4', 'avi', 'mov', 'mkv', 'webm', 'flv', 'wmv'}
+
+        if file_ext in audio_exts:
+            media_type = "音频"
+            default_segments = self._get_audio_default_segments(file_name)
+        elif file_ext in video_exts:
+            media_type = "视频"
+            default_segments = self._get_video_default_segments(file_name)
+        else:
+            media_type = "媒体"
+            default_segments = self._get_media_default_segments(file_name)
+
+        maxkb_logger.info(f"Processing {media_type} file: {file_name}")
+        maxkb_logger.info(f"Generating {len(default_segments)} default segments")
+
+        # Convert the segments to the MaxKB paragraph format.
+        paragraphs = [
+            {
+                'content': segment_data['content'],
+                'title': segment_data['title'],
+                'metadata': {
+                    'start_time': segment_data.get('start_time'),
+                    'end_time': segment_data.get('end_time'),
+                    'index': i,
+                    'is_demo': True,
+                    'media_type': media_type,
+                    'file_name': file_name
+                }
+            }
+            for i, segment_data in enumerate(default_segments)
+        ]
+
+        metadata = {
+            'media_processing_status': 'success',
+            'media_type': media_type,
+            'is_demo_content': True,
+            'processing_mode': 'default_text'
+        }
+
+        maxkb_logger.info(f"Successfully created {len(paragraphs)} default paragraphs for {file_name}")
+
+        return {
+            'name': file.name,
+            'content': paragraphs,
+            'metadata': metadata
+        }
+
+    def _handle_actual_processing(self, file, get_buffer, **kwargs) -> dict:
+        """Transcribe the media file through ``MediaAdapter`` and build paragraphs.
+
+        Reads ``stt_model_id``, ``llm_model_id``, ``workspace_id`` and the
+        processing options from ``kwargs``. An option may be given inside an
+        ``options`` dict or as a top-level keyword argument; the dict wins.
+        Adapter failures are logged and returned as a single error paragraph
+        with ``media_processing_status`` set to ``'failed'`` instead of raising.
+        """
+
+        maxkb_logger.info(f"Starting actual processing for media file: {file.name}")
+
+        # Create the adapter on first use; getattr covers instances on which
+        # ``adapter`` was never initialised.
+        if not getattr(self, 'adapter', None):
+            from .media_adapter import MediaAdapter
+            from .media_adapter.logger import MediaLogger
+            logger_wrapper = MediaLogger(maxkb_logger)
+            self.adapter = MediaAdapter(logger=logger_wrapper)
+
+        buffer = get_buffer(file)
+
+        stt_model_id = kwargs.get('stt_model_id')
+        llm_model_id = kwargs.get('llm_model_id')
+        workspace_id = kwargs.get('workspace_id')
+
+        maxkb_logger.info(f"Extracted from kwargs - stt_model_id: {stt_model_id}, llm_model_id: {llm_model_id}, workspace_id: {workspace_id}")
+
+        # ``options`` may be passed explicitly as None, so normalise it first.
+        options_param = kwargs.get('options') or {}
+        option_defaults = {
+            'language': 'auto',
+            'segment_duration': 300,
+            'enable_punctuation': True,
+            'enable_summary': False,
+            'extract_keyframes': False
+        }
+        # Precedence: ``options`` dict, then top-level kwarg, then default.
+        options = {
+            key: options_param.get(key, kwargs.get(key, default))
+            for key, default in option_defaults.items()
+        }
+
+        try:
+            result = self.adapter.process_media(
+                file_content=buffer,
+                file_name=file.name,
+                stt_model_id=stt_model_id,
+                llm_model_id=llm_model_id,
+                workspace_id=workspace_id,
+                options=options
+            )
+
+            # Convert the segments to the MaxKB paragraph format.
+            paragraphs = []
+            for position, segment in enumerate(result.get('segments') or []):
+                # Prefer the enhanced text, falling back to the raw transcript
+                # when it is missing, None or empty.
+                text = segment.get('enhanced_text') or segment.get('text') or ''
+
+                # Prefix the time range only when both ends are known.
+                start_time = segment.get('start_time')
+                end_time = segment.get('end_time')
+                if start_time is not None and end_time is not None:
+                    time_info = f"[{self._format_time(start_time)} - {self._format_time(end_time)}]"
+                    text = f"{time_info}\n{text}"
+
+                summary = segment.get('summary')
+                if summary:
+                    text = f"## 摘要\n\n{summary}\n\n---\n\n{text}"
+                    maxkb_logger.info(f"Adding summary to paragraph: {summary[:50]}...")
+
+                # Segments without an index are numbered by their position.
+                index = segment.get('index')
+                if index is None:
+                    index = position
+
+                paragraphs.append({
+                    'content': text,
+                    'title': f"段落 {index + 1}",
+                    'metadata': {
+                        'start_time': start_time,
+                        'end_time': end_time,
+                        'index': index,
+                        'is_demo': False,
+                        'media_type': 'actual'
+                    }
+                })
+
+            # Copy so the dict returned by the adapter is not mutated.
+            metadata = dict(result.get('metadata') or {})
+            metadata['media_processing_status'] = 'success'
+            metadata['is_demo_content'] = False
+            metadata['processing_mode'] = 'actual_processing'
+
+            maxkb_logger.info(f"Successfully processed {file.name}, generated {len(paragraphs)} actual paragraphs")
+
+            return {
+                'name': file.name,
+                'content': paragraphs,
+                'metadata': metadata
+            }
+
+        except Exception as e:
+            maxkb_logger.error(f"实际处理音视频文件失败: {str(e)}")
+            # Report the failure as a paragraph plus a 'failed' status.
+            return {
+                'name': file.name,
+                'content': [{
+                    'content': f'实际处理失败: {str(e)}',
+                    'title': '错误'
+                }],
+                'metadata': {'error': str(e), 'media_processing_status': 'failed'}
+            }
+
     def get_content(self, file, save_image):
         """获取文件内容(用于预览)"""
         try:
diff --git a/apps/knowledge/tasks/media_learning.py b/apps/knowledge/tasks/media_learning.py
index 56cd8265..53d6f387 100644
--- a/apps/knowledge/tasks/media_learning.py
+++ b/apps/knowledge/tasks/media_learning.py
@@ -63,119 +63,68 @@ def media_learning_by_document(document_id: str, knowledge_id: str, workspace_id
             State.STARTED
         )
 
-        # 生成演示段落数据(不实际处理音频文件)
-        maxkb_logger.info(f"📝 Generating demo paragraphs for media file: {source_file.file_name}")
+        # Run the real audio/video processing pipeline.
+        maxkb_logger.info(f"📝 Processing media file: {source_file.file_name}")
 
-        # 根据文件类型和名称生成合理的演示段落
-        file_extension = source_file.file_name.split('.')[-1].lower()
-        base_name = source_file.file_name.split('.')[0]
-
-        # 生成演示段落数据
-        paragraphs_data = []
-
-        if file_extension in ['mp3', 'wav', 'm4a', 'aac']:
-            # 音频文件演示段落
-            paragraphs_data = [
-                {
-                    'content': f'这是音频文件 "{base_name}" 的第一段内容演示。本段包含了会议的开场介绍和主要议题的说明。',
-                    'title': '开场介绍',
-                    'metadata': {
-                        'segment_type': 'audio',
-                        'segment_index': 1,
-                        'duration': '0:00-2:30',
-                        'file_name': source_file.file_name,
-                        'is_demo': True
-                    }
-                },
-                {
-                    'content': f'这是音频文件 "{base_name}" 的第二段内容演示。本段详细讨论了项目的进展情况和下一步的工作计划。',
-                    'title': '项目进展',
-                    'metadata': {
-                        'segment_type': 'audio',
-                        'segment_index': 2,
-                        'duration': '2:30-5:00',
-                        'file_name': source_file.file_name,
-                        'is_demo': True
-                    }
-                },
-                {
-                    'content': f'这是音频文件 "{base_name}" 的第三段内容演示。本段总结了会议的主要结论和行动项,明确了责任人和时间节点。',
-                    'title': '总结与行动项',
-                    'metadata': {
-                        'segment_type': 'audio',
-                        'segment_index': 3,
-                        'duration': '5:00-7:30',
-                        'file_name': source_file.file_name,
-                        'is_demo': True
-                    }
-                }
-            ]
-        elif file_extension in ['mp4', 'avi', 'mov', 'mkv']:
-            # 视频文件演示段落
-            paragraphs_data = [
-                {
-                    'content': f'这是视频文件 "{base_name}" 的第一段内容演示。本段包含了视频的开场介绍和主要内容概述。',
-                    'title': '开场介绍',
-                    'metadata': {
-                        'segment_type': 'video',
-                        'segment_index': 1,
-                        'duration': '0:00-3:00',
-                        'file_name': source_file.file_name,
-                        'is_demo': True
-                    }
-                },
-                {
-                    'content': f'这是视频文件 "{base_name}" 的第二段内容演示。本段详细展示了产品的功能特性和使用方法。',
-                    'title': '功能演示',
-                    'metadata': {
-                        'segment_type': 'video',
-                        'segment_index': 2,
-                        'duration': '3:00-8:00',
-                        'file_name': source_file.file_name,
-                        'is_demo': True
-                    }
-                },
-                {
-                    'content': f'这是视频文件 "{base_name}" 的第三段内容演示。本段总结了产品的主要优势和适用场景,提供了联系方式。',
-                    'title': '总结与联系方式',
-                    'metadata': {
-                        'segment_type': 'video',
-                        'segment_index': 3,
-                        'duration': '8:00-10:00',
-                        'file_name': source_file.file_name,
-                        'is_demo': True
-                    }
-                }
-            ]
-        else:
-            # 其他类型文件的通用演示段落
-            paragraphs_data = [
-                {
-                    'content': f'这是媒体文件 "{base_name}" 的第一段内容演示。本段包含了文件的基本信息和主要内容概述。',
-                    'title': '文件概述',
-                    'metadata': {
-                        'segment_type': 'media',
-                        'segment_index': 1,
-                        'duration': '0:00-2:00',
-                        'file_name': source_file.file_name,
-                        'is_demo': True
-                    }
-                },
-                {
-                    'content': f'这是媒体文件 "{base_name}" 的第二段内容演示。本段详细介绍了文件的核心内容和关键信息。',
-                    'title': '核心内容',
-                    'metadata': {
-                        'segment_type': 'media',
-                        'segment_index': 2,
-                        'duration': '2:00-4:00',
-                        'file_name': source_file.file_name,
-                        'is_demo': True
-                    }
+        # Delegate to MediaSplitHandle for the actual processing.
+        try:
+            from common.handle.impl.media.media_split_handle import MediaSplitHandle
+            from django.core.files.base import ContentFile
+
+            handler = MediaSplitHandle()
+
+            # Wrap the stored bytes in an in-memory file object for the handler.
+            temp_file = ContentFile(source_file.get_bytes(), name=source_file.file_name)
+
+            def get_buffer(file_obj):
+                return file_obj.read()
+
+            # use_actual_processing=True disables the default-text mode.
+            result = handler.handle(
+                file=temp_file,
+                pattern_list=[],
+                with_filter=False,
+                limit=0,  # 0 = do not cap the number of paragraphs
+                get_buffer=get_buffer,
+                save_image=False,
+                stt_model_id=stt_model_id,
+                llm_model_id=llm_model_id,
+                workspace_id=workspace_id,
+                use_actual_processing=True
+            )
+
+            paragraphs_data = [
+                {
+                    'content': paragraph['content'],
+                    'title': paragraph['title'],
+                    'metadata': paragraph.get('metadata', {})
+                }
+                for paragraph in result.get('content', [])
+            ]
+
+            # The handler reports its own failures through the metadata
+            # instead of raising, so check the status before claiming success.
+            result_metadata = result.get('metadata') or {}
+            if result_metadata.get('media_processing_status') == 'failed':
+                maxkb_logger.error(f"❌ Media processing reported failure: {result_metadata.get('error')}")
+            else:
+                maxkb_logger.info(f"✅ Successfully processed media file, generated {len(paragraphs_data)} paragraphs")
+
+        except Exception as processing_error:
+            maxkb_logger.error(f"❌ Failed to process media file: {str(processing_error)}")
+            # Fall back to a single paragraph describing the failure.
+            paragraphs_data = [
+                {
+                    'content': f'音视频文件 "{source_file.file_name}" 处理失败: {str(processing_error)}',
+                    'title': '处理失败',
+                    'metadata': {
+                        'error': str(processing_error),
+                        'file_name': source_file.file_name
+                    }
                 }
             ]
 
-        maxkb_logger.info(f"📝 Generated {len(paragraphs_data)} demo paragraphs for media file")
-        maxkb_logger.info(f"🔧 Note: Using demo content instead of actual audio/video processing")
+        maxkb_logger.info(f"📝 Generated {len(paragraphs_data)} paragraphs for media file")
 
         # 第2步:更新状态为索引中(段落创建和向量化)
         maxkb_logger.info(f"📚 Updating status to: STARTED (索引中)")
@@ -187,10 +136,11 @@ def media_learning_by_document(document_id: str, knowledge_id: str, workspace_id
         for idx, para_data in enumerate(paragraphs_data):
             paragraph = Paragraph(
                 document_id=document_id,
+                knowledge_id=knowledge_id,
                 content=para_data.get('content', ''),
                 title=para_data.get('title', f'段落 {idx + 1}'),
                 position=idx + 1,
-                meta=para_data.get('metadata', {})
+                status_meta=para_data.get('metadata', {})
             )
             paragraph_models.append(paragraph)
 