diff --git a/apps/common/handle/impl/media/media_split_handle.py b/apps/common/handle/impl/media/media_split_handle.py index dffbe4c6..a80aa70c 100644 --- a/apps/common/handle/impl/media/media_split_handle.py +++ b/apps/common/handle/impl/media/media_split_handle.py @@ -29,19 +29,33 @@ class MediaSplitHandle(BaseSplitHandle): def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image, **kwargs): """处理音视频文件""" - + maxkb_logger.info(f"MediaSplitHandle.handle called with file: {file.name}") - + maxkb_logger.info(f"Split parameters - limit: {limit}, patterns: {pattern_list}, with_filter: {with_filter}") + # 检查是否需要实际处理 use_actual_processing = kwargs.get('use_actual_processing', False) stt_model_id = kwargs.get('stt_model_id') - + if use_actual_processing and stt_model_id: # 进行实际处理 - return self._handle_actual_processing(file, get_buffer, **kwargs) + result = self._handle_actual_processing(file, get_buffer, **kwargs) + + # 应用智能分块(如果需要) + chunk_limit = limit if limit > 0 else 1000 # 默认1000字符 + if len(result.get('content', [])) > 0: + result = self._apply_smart_split(result, chunk_limit, with_filter) + + return result else: # 使用默认文本 - return self._handle_default_text(file, **kwargs) + result = self._handle_default_text(file, **kwargs) + + # 即使是默认文本也应用分块(如果有limit参数) + if limit > 0: + result = self._apply_smart_split(result, limit, with_filter) + + return result def _get_audio_default_segments(self, file_name: str) -> List[dict]: """生成音频文件的默认段落""" @@ -296,4 +310,123 @@ class MediaSplitHandle(BaseSplitHandle): if hours > 0: return f"{hours:02d}:{minutes:02d}:{secs:02d}" else: - return f"{minutes:02d}:{secs:02d}" \ No newline at end of file + return f"{minutes:02d}:{secs:02d}" + + def smart_split_transcription(self, text, limit=1000, overlap=100): + """智能分割转录文本""" + if len(text) <= limit: + return [text] + + # 预处理:移除过多的空行 + text = '\n'.join(line for line in text.split('\n') if line.strip()) + + chunks = [] + start = 0 + + while start < len(text): + # 计算当前块的结束位置 + end = min(start + limit, len(text)) + + # 如果不是最后一块,尝试在句号处断开 + if end < len(text): + # 寻找最佳断点 + best_end = end + + # 优先在句号处断开 + last_period = text.rfind('。', start, end) + if last_period > start + limit // 2: + best_end = last_period + 1 + else: + # 其次在换行符处断开 + last_newline = text.rfind('\n', start, end) + if last_newline > start + limit // 3: + best_end = last_newline + 1 + + end = best_end + + # 提取当前块 + chunk = text[start:end].strip() + if chunk: + chunks.append(chunk) + + # 计算下一块的开始位置(考虑重叠) + start = max(end - overlap, len(chunks[-1]) if chunks else 0) + + # 避免无限循环 + if start >= len(text): + break + + return chunks + + def _apply_smart_split(self, result: dict, limit: int = 1000, with_filter: bool = False): + """应用智能分块到转录结果""" + overlap = 100 # 前后重叠字符数 + + new_paragraphs = [] + for paragraph in result.get('content', []): + content = paragraph.get('content', '') + + # 应用文本过滤(如果需要) + if with_filter: + content = self._clean_text(content) + + if content: + # 应用智能分块 + chunks = self.smart_split_transcription(content, limit, overlap) + + # 如果只有一个块且没有变化,保持原样 + if len(chunks) == 1 and chunks[0] == content: + new_paragraphs.append(paragraph) + else: + # 创建新的段落 + for idx, chunk in enumerate(chunks): + # 保留原始元数据,但更新分段相关信息 + metadata = paragraph.get('metadata', {}).copy() + metadata.update({ + 'chunk_index': idx, + 'total_chunks': len(chunks), + 'split_method': 'smart_transcription', + 'split_limit': limit, + 'split_overlap': overlap, + 'with_filter': with_filter + }) + + new_paragraph = { + 'content': chunk, + 'title': f"{paragraph.get('title', '段落')} - 第{idx + 1}部分" if len(chunks) > 1 else paragraph.get('title', '段落'), + 'metadata': metadata + } + new_paragraphs.append(new_paragraph) + else: + new_paragraphs.append(paragraph) + + # 更新结果 + result['content'] = new_paragraphs + + # 更新元数据 + metadata = result.get('metadata', {}) + metadata['smart_split_applied'] = True + metadata['total_chunks'] = len(new_paragraphs) + result['metadata'] = metadata + + maxkb_logger.info(f"Applied smart transcription split: {len(new_paragraphs)} chunks") + + return result + + def _clean_text(self, text): + """清理文本:去掉重复多余符号空格、空行、制表符""" + import re + + # 移除多余的空白字符 + text = re.sub(r'\s+', ' ', text) + + # 移除开头和结尾的空白 + text = text.strip() + + # 移除重复的标点符号 + text = re.sub(r'([。!?,])\1+', r'\1', text) + + # 移除多余的换行 + text = re.sub(r'\n{3,}', '\n\n', text) + + return text \ No newline at end of file diff --git a/apps/knowledge/serializers/document.py b/apps/knowledge/serializers/document.py index 62f75aa0..d289f1ec 100644 --- a/apps/knowledge/serializers/document.py +++ b/apps/knowledge/serializers/document.py @@ -249,15 +249,33 @@ def submit_advanced_learning_task(document_id, knowledge_id, workspace_id, llm_m def submit_media_learning_task(document_id, knowledge_id, workspace_id, stt_model_id, llm_model_id): """提交音视频学习任务的辅助函数""" try: + # 获取文档实例以访问分块参数 + from knowledge.models import Document + document = QuerySet(Document).filter(id=document_id).first() + + # 获取分块参数 + limit = None + patterns = None + with_filter = None + + if document and document.meta: + limit = document.meta.get('split_limit') + patterns = document.meta.get('split_patterns', []) + with_filter = document.meta.get('split_with_filter') + from knowledge.tasks.media_learning import media_learning_by_document media_learning_by_document.delay( document_id, knowledge_id, workspace_id, stt_model_id, - llm_model_id + llm_model_id, + limit=limit, + patterns=patterns, + with_filter=with_filter ) maxkb_logger.info(f"Media learning task submitted for document: {document_id}") + maxkb_logger.info(f"Split parameters - limit: {limit}, patterns: {patterns}, with_filter: {with_filter}") except Exception as e: maxkb_logger.error(f"Failed to submit media learning task: {str(e)}") ListenerManagement.update_status( diff --git a/apps/knowledge/tasks/media_learning.py b/apps/knowledge/tasks/media_learning.py index ac322b1c..bee96818 100644 --- a/apps/knowledge/tasks/media_learning.py +++ b/apps/knowledge/tasks/media_learning.py @@ -16,27 +16,34 @@ from common.handle.impl.media.media_split_handle import MediaSplitHandle @shared_task(name='media_learning_by_document') -def media_learning_by_document(document_id: str, knowledge_id: str, workspace_id: str, - stt_model_id: str, llm_model_id: Optional[str] = None): +def media_learning_by_document(document_id: str, knowledge_id: str, workspace_id: str, + stt_model_id: str, llm_model_id: Optional[str] = None, + limit: Optional[int] = None, + patterns: Optional[List[str]] = None, + with_filter: Optional[bool] = None): """ 音视频文档异步处理任务 - 完整状态流转 - + 状态流程: 1. 排队中 (PENDING) - 任务已提交,等待处理 2. 生成中 (STARTED) - 正在转写音视频内容 3. 索引中 (STARTED + 段落创建) - 正在创建段落和索引 4. 完成 (SUCCESS) - 处理完成 5. 失败 (FAILURE) - 处理失败 - + Args: document_id: 文档ID - knowledge_id: 知识库ID + knowledge_id: 知识库ID workspace_id: 工作空间ID stt_model_id: STT模型ID llm_model_id: LLM模型ID(可选) + limit: 分段长度(可选) + patterns: 分段正则列表(可选) + with_filter: 是否清除特殊字符(可选) """ maxkb_logger.info(f"🎬 Starting media learning task for document: {document_id}") maxkb_logger.info(f"📋 Current status: PENDING (排队中)") + maxkb_logger.info(f"📝 Split parameters - limit: {limit}, patterns: {patterns}, with_filter: {with_filter}") try: # 验证文档存在 @@ -81,12 +88,12 @@ def media_learning_by_document(document_id: str, knowledge_id: str, workspace_id def get_buffer(file_obj): return file_obj.read() - # 处理音视频文件(禁用默认文本模式) + # 处理音视频文件(传递分块参数) result = handler.handle( file=temp_file, - pattern_list=[], - with_filter=False, - limit=0, # 不限制段落数量 + pattern_list=patterns or [], # 使用传入的分段模式 + with_filter=with_filter if with_filter is not None else False, + limit=limit if limit is not None else 0, # 0表示使用默认值(在handle中会转为1000) get_buffer=get_buffer, save_image=False, stt_model_id=stt_model_id, @@ -180,24 +187,31 @@ def media_learning_by_document(document_id: str, knowledge_id: str, workspace_id @shared_task(name='media_learning_batch') def media_learning_batch(document_id_list: List[str], knowledge_id: str, workspace_id: str, - stt_model_id: str, llm_model_id: Optional[str] = None): + stt_model_id: str, llm_model_id: Optional[str] = None, + limit: Optional[int] = None, + patterns: Optional[List[str]] = None, + with_filter: Optional[bool] = None): """ 批量音视频处理任务 - + Args: document_id_list: 文档ID列表 knowledge_id: 知识库ID workspace_id: 工作空间ID stt_model_id: STT模型ID llm_model_id: LLM模型ID(可选) + limit: 分段长度(可选) + patterns: 分段正则列表(可选) + with_filter: 是否清除特殊字符(可选) """ maxkb_logger.info(f"🎬 Starting batch media learning for {len(document_id_list)} documents") - + # 为每个文档提交单独的处理任务 for document_id in document_id_list: try: media_learning_by_document.delay( - document_id, knowledge_id, workspace_id, stt_model_id, llm_model_id + document_id, knowledge_id, workspace_id, stt_model_id, llm_model_id, + limit, patterns, with_filter ) maxkb_logger.info(f"📋 Submitted media learning task for document: {document_id}") except Exception as e: