From f1494fedea307d36599619e9f936e8d573a1fcd6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9C=B1=E6=BD=AE?= Date: Mon, 25 Aug 2025 01:20:33 +0800 Subject: [PATCH] modify status parsing --- .../serializers/application_chat_record.py | 2 +- apps/common/event/listener_manage.py | 2 +- .../impl/mineru/gbase_adapter/adapter.py | 2 +- .../impl/mineru/maxkb_adapter/adapter.py | 11 +- .../handle/impl/text/mineru_split_handle.py | 30 +- apps/knowledge/api/document.py | 32 +- apps/knowledge/models/knowledge.py | 2 + apps/knowledge/serializers/document.py | 228 ++++++++------ apps/knowledge/serializers/knowledge.py | 6 +- apps/knowledge/serializers/paragraph.py | 4 +- apps/knowledge/serializers/problem.py | 2 +- apps/knowledge/sql/list_document.sql | 6 +- apps/knowledge/task/__init__.py | 0 apps/knowledge/tasks/__init__.py | 5 + apps/knowledge/tasks/advanced_learning.py | 208 +++++++++++++ apps/knowledge/{task => tasks}/embedding.py | 0 apps/knowledge/{task => tasks}/generate.py | 2 +- apps/knowledge/{task => tasks}/handler.py | 0 apps/knowledge/{task => tasks}/sync.py | 6 +- apps/knowledge/urls.py | 1 + apps/knowledge/views/document.py | 38 ++- .../kimi_model_provider/credential/image.py | 79 +++++ .../kimi_model_provider.py | 24 +- .../impl/kimi_model_provider/model/image.py | 32 ++ apps/ops/__init__.py | 12 + dev/Makefile | 67 +++++ dev/sync_mineru_dirs.py | 281 ++++++++++++++++++ ui/src/api/knowledge/document.ts | 30 ++ ui/src/locales/lang/zh-CN/common.ts | 1 + ui/src/locales/lang/zh-CN/views/document.ts | 23 +- ui/src/utils/status.ts | 4 + .../component/AdvancedLearningDialog.vue | 185 ++++++++++++ ui/src/views/document/component/Status.vue | 6 + ui/src/views/document/index.vue | 40 +++ .../views/document/upload/UploadComponent.vue | 6 +- 35 files changed, 1248 insertions(+), 129 deletions(-) delete mode 100644 apps/knowledge/task/__init__.py create mode 100644 apps/knowledge/tasks/__init__.py create mode 100644 apps/knowledge/tasks/advanced_learning.py rename apps/knowledge/{task => tasks}/embedding.py (100%) rename apps/knowledge/{task => tasks}/generate.py (99%) rename apps/knowledge/{task => tasks}/handler.py (100%) rename apps/knowledge/{task => tasks}/sync.py (93%) create mode 100644 apps/models_provider/impl/kimi_model_provider/credential/image.py create mode 100644 apps/models_provider/impl/kimi_model_provider/model/image.py create mode 100644 dev/Makefile create mode 100755 dev/sync_mineru_dirs.py create mode 100644 ui/src/views/document/component/AdvancedLearningDialog.vue diff --git a/apps/application/serializers/application_chat_record.py b/apps/application/serializers/application_chat_record.py index 71500fac..5c66ef46 100644 --- a/apps/application/serializers/application_chat_record.py +++ b/apps/application/serializers/application_chat_record.py @@ -26,7 +26,7 @@ from common.utils.common import post from knowledge.models import Paragraph, Document, Problem, ProblemParagraphMapping, Knowledge from knowledge.serializers.common import get_embedding_model_id_by_knowledge_id, update_document_char_length from knowledge.serializers.paragraph import ParagraphSerializers -from knowledge.task.embedding import embedding_by_paragraph, embedding_by_paragraph_list +from knowledge.tasks.embedding import embedding_by_paragraph, embedding_by_paragraph_list class ChatRecordSerializerModel(serializers.ModelSerializer): diff --git a/apps/common/event/listener_manage.py b/apps/common/event/listener_manage.py index 9aaa29ba..167e5bc4 100644 --- a/apps/common/event/listener_manage.py +++ 
b/apps/common/event/listener_manage.py @@ -275,7 +275,7 @@ class ListenerManagement: return maxkb_logger.info(_('Start--->Embedding document: {document_id}').format(document_id=document_id) ) - # 批量修改状态为PADDING + # Batch update the status to STARTED ListenerManagement.update_status(QuerySet(Document).filter(id=document_id), TaskType.EMBEDDING, State.STARTED) diff --git a/apps/common/handle/impl/mineru/gbase_adapter/adapter.py b/apps/common/handle/impl/mineru/gbase_adapter/adapter.py index e3e0d8fd..0f74c37f 100644 --- a/apps/common/handle/impl/mineru/gbase_adapter/adapter.py +++ b/apps/common/handle/impl/mineru/gbase_adapter/adapter.py @@ -11,7 +11,7 @@ GPTBase平台适配器 - 实现GPTBase特定的功能 import contextlib from typing import Any, Dict from .logger import logger - +import os from .gptbase_utils import GPTBaseUtils, GZeroUtils from .config_gptbase import GPTBaseMinerUConfig from ..base_parser import PlatformAdapter, BaseMinerUExtractor, ProcessingResult diff --git a/apps/common/handle/impl/mineru/maxkb_adapter/adapter.py b/apps/common/handle/impl/mineru/maxkb_adapter/adapter.py index 9c86608c..1063a7fa 100644 --- a/apps/common/handle/impl/mineru/maxkb_adapter/adapter.py +++ b/apps/common/handle/impl/mineru/maxkb_adapter/adapter.py @@ -242,7 +242,7 @@ class MinerUAdapter: raise def process_document(self, file_content: bytes, file_name: str, - save_image_func=None) -> Dict[str, Any]: + save_image_func=None, **kwargs) -> Dict[str, Any]: """ 处理文档并返回结构化内容 @@ -250,6 +250,7 @@ class MinerUAdapter: file_content: 文件内容字节流 file_name: 文件名 save_image_func: 保存图片的函数 + **kwargs: extra parameters, including llm_model_id and vision_model_id Returns: 包含sections的字典,每个section包含content、title和images @@ -276,6 +277,14 @@ class MinerUAdapter: loop = asyncio.new_event_loop() asyncio.set_event_loop(loop) try: + # Extract the model ID parameters + llm_model_id = kwargs.get('llm_model_id') + vision_model_id = kwargs.get('vision_model_id') + if llm_model_id and vision_model_id: + logger.info(f"Processing document with the specified models: LLM={llm_model_id}, Vision={vision_model_id}") + # TODO: pass the model IDs through to the extractor + # The default configuration is used for now; the models can be wired in here later + result = loop.run_until_complete( self.extractor.process_file(tmp_file_path, file_name) ) diff --git a/apps/common/handle/impl/text/mineru_split_handle.py b/apps/common/handle/impl/text/mineru_split_handle.py index 8d38d96f..64e3736d 100644 --- a/apps/common/handle/impl/text/mineru_split_handle.py +++ b/apps/common/handle/impl/text/mineru_split_handle.py @@ -41,29 +41,43 @@ class MinerUSplitHandle(BaseSplitHandle): return True def handle(self, file, pattern_list: List, with_filter: bool, limit: int, - get_buffer, save_image): + get_buffer, save_image, **kwargs): """ 使用MinerU处理文档 """ try: logger.info(f"MinerUSplitHandle.handle called for file: {file.name if hasattr(file, 'name') else 'unknown'}") - # 初始化MinerU适配器 + # Initialize the MinerU adapter, passing the model IDs if provided if not self.mineru_adapter: logger.info("Initializing MinerU adapter") + llm_model_id = kwargs.get('llm_model_id') + vision_model_id = kwargs.get('vision_model_id') + if llm_model_id and vision_model_id: + logger.info(f"Using models: LLM={llm_model_id}, Vision={vision_model_id}") + self.mineru_adapter = MinerUAdapter() # 获取文件内容 buffer = get_buffer(file) logger.info(f"File buffer size: {len(buffer) if buffer else 0} bytes") - # 处理文档 + # Process the document, passing the model IDs to the adapter logger.info("Calling MinerU adapter to process document") - result = self.mineru_adapter.process_document( - file_content=buffer, - file_name=file.name if hasattr(file, 'name') else 'document.pdf', - save_image_func=save_image - ) + process_kwargs = { + 'file_content': buffer, + 'file_name': file.name if 
hasattr(file, 'name') else 'document.pdf', + 'save_image_func': save_image + } + + # Pass the model IDs on to the adapter if present + llm_model_id = kwargs.get('llm_model_id') + vision_model_id = kwargs.get('vision_model_id') + if llm_model_id: + process_kwargs['llm_model_id'] = llm_model_id + if vision_model_id: + process_kwargs['vision_model_id'] = vision_model_id + + result = self.mineru_adapter.process_document(**process_kwargs) logger.info(f"MinerU adapter returned result with {len(result.get('sections', []))} sections") # 转换为段落格式 diff --git a/apps/knowledge/api/document.py b/apps/knowledge/api/document.py index eda06d8d..ae744d1e 100644 --- a/apps/knowledge/api/document.py +++ b/apps/knowledge/api/document.py @@ -6,7 +6,8 @@ from common.result import DefaultResultSerializer from knowledge.serializers.common import BatchSerializer from knowledge.serializers.document import DocumentInstanceSerializer, DocumentWebInstanceSerializer, \ CancelInstanceSerializer, BatchCancelInstanceSerializer, DocumentRefreshSerializer, BatchEditHitHandlingSerializer, \ - DocumentBatchRefreshSerializer, DocumentBatchGenerateRelatedSerializer, DocumentMigrateSerializer + DocumentBatchRefreshSerializer, DocumentBatchGenerateRelatedSerializer, DocumentMigrateSerializer, \ + DocumentBatchAdvancedLearningSerializer class DocumentSplitAPI(APIMixin): @@ -473,6 +474,35 @@ class DocumentExportAPI(APIMixin): return DefaultResultSerializer +class BatchAdvancedLearningAPI(APIMixin): + @staticmethod + def get_parameters(): + return [ + OpenApiParameter( + name="workspace_id", + description="workspace id", + type=OpenApiTypes.STR, + location='path', + required=True, + ), + OpenApiParameter( + name="knowledge_id", + description="knowledge id", + type=OpenApiTypes.STR, + location='path', + required=True, + ), + ] + + @staticmethod + def get_request(): + return DocumentBatchAdvancedLearningSerializer + + @staticmethod + def get_response(): + return DefaultResultSerializer + + class DocumentMigrateAPI(APIMixin): @staticmethod def get_parameters(): diff --git a/apps/knowledge/models/knowledge.py b/apps/knowledge/models/knowledge.py index 07206807..8ed86b54 100644 --- a/apps/knowledge/models/knowledge.py +++ b/apps/knowledge/models/knowledge.py @@ -47,6 +47,8 @@ class State(Enum): REVOKE = '4' # 取消成功 REVOKED = '5' + # Parsing + PARSING = '6' # 忽略 IGNORED = 'n' diff --git a/apps/knowledge/serializers/document.py b/apps/knowledge/serializers/document.py index cd516f5a..44e5141f 100644 --- a/apps/knowledge/serializers/document.py +++ b/apps/knowledge/serializers/document.py @@ -45,16 +45,16 @@ from common.utils.fork import Fork from common.utils.logger import maxkb_logger from common.utils.split_model import get_split_model, flat_map from knowledge.models import Knowledge, Paragraph, Problem, Document, KnowledgeType, ProblemParagraphMapping, State, \ - TaskType, File, FileSourceType + TaskType, File, FileSourceType, get_default_status from knowledge.serializers.common import ProblemParagraphManage, BatchSerializer, \ get_embedding_model_id_by_knowledge_id, MetaSerializer, write_image, zip_dir from knowledge.serializers.paragraph import ParagraphSerializers, ParagraphInstanceSerializer, \ delete_problems_and_mappings -from knowledge.task.embedding import embedding_by_document, delete_embedding_by_document_list, \ +from knowledge.tasks.embedding import embedding_by_document, delete_embedding_by_document_list, \ delete_embedding_by_document, delete_embedding_by_paragraph_ids, embedding_by_document_list, \ update_embedding_knowledge_id -from knowledge.task.generate import 
generate_related_by_document_id -from knowledge.task.sync import sync_web_document +from knowledge.tasks.generate import generate_related_by_document_id +from knowledge.tasks.sync import sync_web_document from maxkb.const import PROJECT_DIR from models_provider.models import Model from oss.serializers.file import FileSerializer @@ -200,6 +200,12 @@ class DocumentMigrateSerializer(serializers.Serializer): document_id_list = serializers.ListField(required=True, label=_('document id list')) +class DocumentBatchAdvancedLearningSerializer(serializers.Serializer): + id_list = serializers.ListField(required=True, label=_('document id list')) + llm_model = serializers.CharField(required=True, label=_('llm model id')) + vision_model = serializers.CharField(required=True, label=_('vision model id')) + + class BatchEditHitHandlingSerializer(serializers.Serializer): id_list = serializers.ListField(required=True, child=serializers.UUIDField(required=True), label=_('id list')) hit_handling_method = serializers.CharField(required=True, label=_('hit handling method')) @@ -866,6 +872,11 @@ class DocumentSerializers(serializers.Serializer): @staticmethod def get_document_paragraph_model(knowledge_id, instance: Dict): source_meta = {'source_file_id': instance.get('source_file_id')} if instance.get('source_file_id') else {} + # Add the MinerU model parameters to meta + if instance.get('llm_model_id'): + source_meta['llm_model_id'] = instance.get('llm_model_id') + if instance.get('vision_model_id'): + source_meta['vision_model_id'] = instance.get('vision_model_id') meta = {**instance.get('meta'), **source_meta} if instance.get('meta') is not None else source_meta meta = convert_uuid_to_str(meta) @@ -1156,99 +1167,10 @@ class DocumentSerializers(serializers.Serializer): maxkb_logger.info(f"Processing document: {document.get('name')}, llm_model_id: {llm_model_id}, vision_model_id: {vision_model_id}") if llm_model_id or vision_model_id: - maxkb_logger.info(f"Document {document.get('name')} is MinerU type, processing with MinerU handler") - source_file_id = document.get('source_file_id') - maxkb_logger.info(f"Source file ID: {source_file_id}") - if source_file_id: - # 获取源文件 - source_file = QuerySet(File).filter(id=source_file_id).first() - maxkb_logger.info(f"Source file found: {source_file is not None}") - if source_file: - try: - # 使用MinerU处理器重新解析文档 - from common.handle.impl.text.mineru_split_handle import MinerUSplitHandle - from common.utils.split_model import get_split_model - import io - - # 检查MinerU配置 - mineru_api_type = os.environ.get('MINERU_API_TYPE', '') - if not mineru_api_type: - maxkb_logger.warning(f"MinerU API not configured, skipping MinerU processing for document: {document.get('name')}") - continue - - maxkb_logger.info(f"MinerU API configured: {mineru_api_type}") - mineru_handler = MinerUSplitHandle() - - # 获取文件内容 - file_content = source_file.get_bytes() - temp_file = io.BytesIO(file_content) - temp_file.name = source_file.file_name - - # 从现有段落中提取分段模式 - # 获取用户在预览时设置的分段规则 - pattern_list = [] - if 'paragraphs' in document and len(document['paragraphs']) > 0: - # 尝试从元数据或其他地方获取分段模式 - # 这里我们需要从前端传递分段规则 - patterns = document.get('split_patterns', []) - if patterns: - pattern_list = [get_split_model(pattern) for pattern in patterns if pattern] - - def get_buffer(file): - file.seek(0) - return file.read() - - def save_image(image_list): - if image_list is not None and len(image_list) > 0: - exist_image_list = [str(i.get('id')) for i in - QuerySet(File).filter(id__in=[i.id for i in image_list]).values('id')] - 
save_image_list = [image for image in image_list if not exist_image_list.__contains__(str(image.id))] - save_image_list = list({img.id: img for img in save_image_list}.values()) - for file in save_image_list: - file_bytes = file.meta.pop('content') - file.meta['knowledge_id'] = knowledge_id - file.source_type = FileSourceType.KNOWLEDGE - file.source_id = knowledge_id - file.save(file_bytes) - - # 使用MinerU处理,不传递is_preview参数,这样MinerU会被使用 - maxkb_logger.info(f"Using MinerU to process document: {document.get('name')}") - paragraphs = mineru_handler.handle( - temp_file, - pattern_list, - False, # with_filter - 0, # limit (0表示不限制) - get_buffer, - save_image - ) - - # 如果原来有问题列表,需要保留 - if 'paragraphs' in document and document['paragraphs']: - # 尝试将原有的问题列表合并到新的段落中 - old_problems = {} - for old_para in document['paragraphs']: - if 'content' in old_para and 'problem_list' in old_para: - old_problems[old_para['content'][:100]] = old_para['problem_list'] - - # 为新段落添加问题列表(如果内容相似) - for new_para in paragraphs: - content_key = new_para.get('content', '')[:100] - if content_key in old_problems: - new_para['problem_list'] = old_problems[content_key] - - # 替换文档的段落为MinerU处理后的结果 - maxkb_logger.info(f"MinerU returned {len(paragraphs) if paragraphs else 0} paragraphs") - if paragraphs and len(paragraphs) > 0: - maxkb_logger.info(f"First paragraph sample: {paragraphs[0] if paragraphs else 'None'}") - document['paragraphs'] = paragraphs - else: - maxkb_logger.warning(f"MinerU returned empty paragraphs, keeping original paragraphs") - maxkb_logger.info(f"MinerU processing completed for document: {document.get('name')}, paragraphs count: {len(document.get('paragraphs', []))}") - - except Exception as e: - # 如果MinerU处理失败,保持原有段落 - maxkb_logger.error(f"MinerU processing failed for document {document.get('name')}: {str(e)}", exc_info=True) - # 保持原有段落不变 + maxkb_logger.info(f"Document {document.get('name')} is MinerU type, will process asynchronously") + # For MinerU-type documents, save the basic info without processing paragraphs + # Paragraph processing is handled by an asynchronous task + document['paragraphs'] = [] # Clear the paragraphs; the async task will fill them in # 插入文档 for document in instance_list: @@ -1270,7 +1192,44 @@ class DocumentSerializers(serializers.Serializer): ) # 插入文档 QuerySet(Document).bulk_create(document_model_list) if len(document_model_list) > 0 else None - # 批量插入段落 + + # Dispatch the async tasks for advanced-learning documents + for idx, document in enumerate(instance_list): + llm_model_id = document.get('llm_model_id') + vision_model_id = document.get('vision_model_id') + if llm_model_id and vision_model_id and document_model_list: + # Locate the matching document model + document_model = document_model_list[idx] + maxkb_logger.info(f"Submitting async advanced learning task for document: {document_model.id}") + + # Set the document status to parsing + ListenerManagement.update_status( + QuerySet(Document).filter(id=document_model.id), + TaskType.EMBEDDING, + State.PARSING + ) + + # Submit the async task + try: + from knowledge.tasks.advanced_learning import advanced_learning_by_document + advanced_learning_by_document.delay( + str(document_model.id), + str(knowledge_id), + self.data.get('workspace_id', ''), + llm_model_id, + vision_model_id + ) + maxkb_logger.info(f"Advanced learning task submitted for document {document_model.id}") + except Exception as e: + maxkb_logger.error(f"Failed to submit advanced learning task: {str(e)}") + # If submission fails, mark the status as failed + ListenerManagement.update_status( + QuerySet(Document).filter(id=document_model.id), + TaskType.EMBEDDING, + State.FAILURE + ) + + # Bulk insert paragraphs (only for non-advanced-learning documents) if len(paragraph_model_list) > 0: for document in document_model_list: max_position = 
Paragraph.objects.filter(document_id=document.id).aggregate( @@ -1439,6 +1398,79 @@ class DocumentSerializers(serializers.Serializer): pass + class BatchAdvancedLearning(serializers.Serializer): + workspace_id = serializers.CharField(required=True, label=_('workspace id')) + knowledge_id = serializers.UUIDField(required=True, label=_('knowledge id')) + + def is_valid(self, *, raise_exception=False): + super().is_valid(raise_exception=True) + workspace_id = self.data.get('workspace_id') + query_set = QuerySet(Knowledge).filter(id=self.data.get('knowledge_id')) + if workspace_id: + query_set = query_set.filter(workspace_id=workspace_id) + if not query_set.exists(): + raise AppApiException(500, _('Knowledge id does not exist')) + + def batch_advanced_learning(self, instance: Dict, with_valid=True): + if with_valid: + self.is_valid(raise_exception=True) + + document_id_list = instance.get("id_list", []) + llm_model_id = instance.get("llm_model") + vision_model_id = instance.get("vision_model") + + if not document_id_list: + raise AppApiException(500, _('Document list is empty')) + if not llm_model_id or not vision_model_id: + raise AppApiException(500, _('Model selection is required')) + + knowledge_id = self.data.get('knowledge_id') + workspace_id = self.data.get('workspace_id') + + # Fetch the knowledge base + knowledge = QuerySet(Knowledge).filter(id=knowledge_id).first() + if not knowledge: + raise AppApiException(500, _('Knowledge not found')) + + # Check the MinerU configuration + import os + mineru_api_type = os.environ.get('MINERU_API_TYPE', '') + if not mineru_api_type: + raise AppApiException(500, _('MinerU API not configured')) + + # Update the document status to parsing (rather than queued) + for document_id in document_id_list: + ListenerManagement.update_status( + QuerySet(Document).filter(id=document_id), + TaskType.EMBEDDING, + State.PARSING + ) + + # Dispatch the async task to process the documents + try: + from knowledge.tasks.advanced_learning import batch_advanced_learning + batch_advanced_learning.delay( + document_id_list, + str(knowledge_id), + workspace_id, + llm_model_id, + vision_model_id + ) + maxkb_logger.info(f"Submitted advanced learning tasks for {len(document_id_list)} documents") + except Exception as e: + maxkb_logger.error(f"Failed to submit advanced learning tasks: {str(e)}") + # If task submission fails, mark the status as failed + for document_id in document_id_list: + ListenerManagement.update_status( + QuerySet(Document).filter(id=document_id), + TaskType.EMBEDDING, + State.FAILURE + ) + raise AppApiException(500, _('Failed to submit advanced learning tasks')) + + return True + + class FileBufferHandle: buffer = None diff --git a/apps/knowledge/serializers/knowledge.py b/apps/knowledge/serializers/knowledge.py index c76135c0..6162afee 100644 --- a/apps/knowledge/serializers/knowledge.py +++ b/apps/knowledge/serializers/knowledge.py @@ -33,9 +33,9 @@ from knowledge.models import Knowledge, KnowledgeScope, KnowledgeType, Document, from knowledge.serializers.common import ProblemParagraphManage, drop_knowledge_index, get_embedding_model_id_by_knowledge_id, MetaSerializer, \ GenerateRelatedSerializer, get_embedding_model_by_knowledge_id, list_paragraph, write_image, zip_dir from knowledge.serializers.document import DocumentSerializers -from knowledge.task.embedding import embedding_by_knowledge, delete_embedding_by_knowledge -from knowledge.task.generate import generate_related_by_knowledge_id -from knowledge.task.sync import sync_web_knowledge, sync_replace_web_knowledge +from knowledge.tasks.embedding import embedding_by_knowledge, delete_embedding_by_knowledge +from knowledge.tasks.generate import 
generate_related_by_knowledge_id +from knowledge.tasks.sync import sync_web_knowledge, sync_replace_web_knowledge from maxkb.conf import PROJECT_DIR from models_provider.models import Model from system_manage.models import WorkspaceUserResourcePermission, AuthTargetType diff --git a/apps/knowledge/serializers/paragraph.py b/apps/knowledge/serializers/paragraph.py index dcc09f97..3e06dafb 100644 --- a/apps/knowledge/serializers/paragraph.py +++ b/apps/knowledge/serializers/paragraph.py @@ -18,11 +18,11 @@ from knowledge.models import Paragraph, Problem, Document, ProblemParagraphMappi from knowledge.serializers.common import ProblemParagraphObject, ProblemParagraphManage, \ get_embedding_model_id_by_knowledge_id, update_document_char_length, BatchSerializer from knowledge.serializers.problem import ProblemInstanceSerializer, ProblemSerializer, ProblemSerializers -from knowledge.task.embedding import embedding_by_paragraph, enable_embedding_by_paragraph, \ +from knowledge.tasks.embedding import embedding_by_paragraph, enable_embedding_by_paragraph, \ disable_embedding_by_paragraph, \ delete_embedding_by_paragraph, embedding_by_problem as embedding_by_problem_task, delete_embedding_by_paragraph_ids, \ embedding_by_problem, delete_embedding_by_source, update_embedding_document_id -from knowledge.task.generate import generate_related_by_paragraph_id_list +from knowledge.tasks.generate import generate_related_by_paragraph_id_list class ParagraphSerializer(serializers.ModelSerializer): diff --git a/apps/knowledge/serializers/problem.py b/apps/knowledge/serializers/problem.py index 533258fc..02a7baaf 100644 --- a/apps/knowledge/serializers/problem.py +++ b/apps/knowledge/serializers/problem.py @@ -13,7 +13,7 @@ from common.exception.app_exception import AppApiException from common.utils.common import get_file_content from knowledge.models import Problem, ProblemParagraphMapping, Paragraph, Knowledge, SourceType from knowledge.serializers.common import get_embedding_model_id_by_knowledge_id -from knowledge.task.embedding import delete_embedding_by_source_ids, update_problem_embedding, embedding_by_data_list +from knowledge.tasks.embedding import delete_embedding_by_source_ids, update_problem_embedding, embedding_by_data_list from maxkb.const import PROJECT_DIR diff --git a/apps/knowledge/sql/list_document.sql b/apps/knowledge/sql/list_document.sql index 8b7891bf..424c17df 100644 --- a/apps/knowledge/sql/list_document.sql +++ b/apps/knowledge/sql/list_document.sql @@ -3,7 +3,11 @@ SELECT "document".* , to_json("document"."meta") as meta, to_json("document"."status_meta") as status_meta, - (SELECT "count"("id") FROM "paragraph" WHERE document_id="document"."id") as "paragraph_count" + (SELECT "count"("id") FROM "paragraph" WHERE document_id="document"."id") as "paragraph_count", + CASE + WHEN "document"."meta"->>'llm_model_id' IS NOT NULL THEN 'advanced' + ELSE 'regular' + END as "learning_type" FROM "document" "document" ${document_custom_sql} diff --git a/apps/knowledge/task/__init__.py b/apps/knowledge/task/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/apps/knowledge/tasks/__init__.py b/apps/knowledge/tasks/__init__.py new file mode 100644 index 00000000..70835501 --- /dev/null +++ b/apps/knowledge/tasks/__init__.py @@ -0,0 +1,5 @@ +# coding=utf-8 + +# Import tasks for Celery discovery +# Note: We import the specific tasks, not * to avoid circular imports +from .advanced_learning import advanced_learning_by_document, batch_advanced_learning \ No newline at end of file 
diff --git a/apps/knowledge/tasks/advanced_learning.py b/apps/knowledge/tasks/advanced_learning.py new file mode 100644 index 00000000..8c30dd39 --- /dev/null +++ b/apps/knowledge/tasks/advanced_learning.py @@ -0,0 +1,208 @@ +# coding=utf-8 +""" +Advanced learning tasks - re-parse documents with MinerU +""" +import uuid as uuid_lib +from typing import List + +from django.db.models import QuerySet + +from ops import celery_app + + +@celery_app.task(name='celery:advanced_learning_by_document') +def advanced_learning_by_document(document_id: str, knowledge_id: str, workspace_id: str, + llm_model_id: str, vision_model_id: str): + """ + Process a document with MinerU advanced learning + + @param document_id: document ID + @param knowledge_id: knowledge base ID + @param workspace_id: workspace ID + @param llm_model_id: LLM model ID + @param vision_model_id: vision model ID + """ + # Deferred imports to avoid circular dependencies + from common.event import ListenerManagement + from common.utils.logger import maxkb_logger + from knowledge.models import Document, Knowledge, Paragraph, State, TaskType, File, FileSourceType, get_default_status + from knowledge.serializers.paragraph import delete_problems_and_mappings + from knowledge.tasks.embedding import delete_embedding_by_document, embedding_by_document + + maxkb_logger.info(f"Starting advanced learning for document {document_id}") + + try: + # Immediately update the status to parsing + ListenerManagement.update_status( + QuerySet(Document).filter(id=document_id), + TaskType.EMBEDDING, + State.PARSING + ) + maxkb_logger.info(f"Updated document {document_id} status to PARSING") + + # Fetch the document + document = QuerySet(Document).filter(id=document_id).first() + if not document: + maxkb_logger.error(f"Document {document_id} not found") + return + + # Fetch the knowledge base + knowledge = QuerySet(Knowledge).filter(id=knowledge_id).first() + if not knowledge: + maxkb_logger.error(f"Knowledge {knowledge_id} not found") + return + + # Fetch the source file + source_file_id = document.meta.get('source_file_id') + if not source_file_id: + maxkb_logger.warning(f"No source file for document {document.id}") + ListenerManagement.update_status( + QuerySet(Document).filter(id=document_id), + TaskType.EMBEDDING, + State.FAILURE + ) + return + + source_file = QuerySet(File).filter(id=source_file_id).first() + if not source_file: + maxkb_logger.warning(f"Source file not found for document {document.id}") + ListenerManagement.update_status( + QuerySet(Document).filter(id=document_id), + TaskType.EMBEDDING, + State.FAILURE + ) + return + + # Delete the existing paragraphs and embedding data + QuerySet(Paragraph).filter(document_id=document_id).delete() + delete_problems_and_mappings([document_id]) + delete_embedding_by_document(document_id) + + # Update the document metadata to record the models used + document.meta['llm_model_id'] = llm_model_id + document.meta['vision_model_id'] = vision_model_id + document.save() + + # Re-parse the document with MinerU + from common.handle.impl.text.mineru_split_handle import MinerUSplitHandle + import io + + mineru_handler = MinerUSplitHandle() + + # Fetch the file content + file_content = source_file.get_bytes() + temp_file = io.BytesIO(file_content) + temp_file.name = source_file.file_name + + def get_buffer(file): + file.seek(0) + return file.read() + + def save_image(image_list): + if image_list is not None and len(image_list) > 0: + exist_image_list = [str(i.get('id')) for i in + QuerySet(File).filter(id__in=[i.id for i in image_list]).values('id')] + save_image_list = [image for image in image_list if str(image.id) not in exist_image_list] + save_image_list = list({img.id: img for img in save_image_list}.values()) + for file in 
save_image_list: + file_bytes = file.meta.pop('content') + file.meta['knowledge_id'] = knowledge_id + file.source_type = FileSourceType.KNOWLEDGE + file.source_id = knowledge_id + file.save(file_bytes) + + # Process the document with MinerU + maxkb_logger.info(f"Using MinerU to reprocess document: {document.name}") + paragraphs = mineru_handler.handle( + temp_file, + [], # pattern_list + False, # with_filter + 0, # limit (0 means unlimited) + get_buffer, + save_image, + llm_model_id=llm_model_id, + vision_model_id=vision_model_id + ) + + if paragraphs and len(paragraphs) > 0: + # Create the new paragraphs + paragraph_model_list = [] + for index, paragraph in enumerate(paragraphs): + paragraph_instance = Paragraph( + id=uuid_lib.uuid4(), + document_id=document_id, + knowledge_id=knowledge_id, + content=paragraph.get('content', ''), + title=paragraph.get('title', ''), + status=get_default_status(), + is_active=True, + hit_num=0, + position=index + 1 + ) + if 'image_list' in paragraph: + paragraph_instance.image_list = paragraph['image_list'] + paragraph_model_list.append(paragraph_instance) + + # Bulk insert the paragraphs + QuerySet(Paragraph).bulk_create(paragraph_model_list) + + # Update the document character count + char_length = sum([len(p.content) for p in paragraph_model_list]) + document.char_length = char_length + document.save() + + # MinerU parsing finished; start the embedding task + embedding_model_id = knowledge.embedding_model_id + maxkb_logger.info(f"Starting embedding for document {document_id} after MinerU parsing") + + # Invoke the embedding task; embedding_by_document moves the status from PARSING to STARTED automatically + embedding_by_document.delay( + str(document_id), + str(embedding_model_id) + ) + + maxkb_logger.info(f"MinerU reprocessing completed for document: {document.name}, " + f"created {len(paragraph_model_list)} paragraphs") + else: + maxkb_logger.warning(f"MinerU returned no paragraphs for document: {document.name}") + # Update the status to failed + ListenerManagement.update_status( + QuerySet(Document).filter(id=document_id), + TaskType.EMBEDDING, + State.FAILURE + ) + + except Exception as e: + maxkb_logger.error(f"Failed to process document {document_id}: {str(e)}", exc_info=True) + # Update the status to failed + ListenerManagement.update_status( + QuerySet(Document).filter(id=document_id), + TaskType.EMBEDDING, + State.FAILURE + ) + + +@celery_app.task(name='celery:batch_advanced_learning') +def batch_advanced_learning(document_id_list: List[str], knowledge_id: str, workspace_id: str, + llm_model_id: str, vision_model_id: str): + """ + Batch advanced learning task + + @param document_id_list: list of document IDs + @param knowledge_id: knowledge base ID + @param workspace_id: workspace ID + @param llm_model_id: LLM model ID + @param vision_model_id: vision model ID + """ + from common.utils.logger import maxkb_logger + maxkb_logger.info(f"batch_advanced_learning called with {len(document_id_list)} documents") + + for document_id in document_id_list: + maxkb_logger.info(f"Submitting advanced_learning_by_document for document {document_id}") + advanced_learning_by_document.apply_async( + args=[str(document_id), str(knowledge_id), workspace_id, llm_model_id, vision_model_id], + queue='celery' + ) \ No newline at end of file diff --git a/apps/knowledge/task/embedding.py b/apps/knowledge/tasks/embedding.py similarity index 100% rename from apps/knowledge/task/embedding.py rename to apps/knowledge/tasks/embedding.py diff --git a/apps/knowledge/task/generate.py b/apps/knowledge/tasks/generate.py similarity index 99% rename from apps/knowledge/task/generate.py rename to apps/knowledge/tasks/generate.py index e69a2ea8..fe2a9fb5 100644 --- a/apps/knowledge/task/generate.py +++ b/apps/knowledge/tasks/generate.py @@ -12,7 +12,7 @@ from 
common.event import ListenerManagement from common.utils.logger import maxkb_logger from common.utils.page_utils import page, page_desc from knowledge.models import Paragraph, Document, Status, TaskType, State -from knowledge.task.handler import save_problem +from knowledge.tasks.handler import save_problem from models_provider.models import Model from models_provider.tools import get_model from ops import celery_app diff --git a/apps/knowledge/task/handler.py b/apps/knowledge/tasks/handler.py similarity index 100% rename from apps/knowledge/task/handler.py rename to apps/knowledge/tasks/handler.py diff --git a/apps/knowledge/task/sync.py b/apps/knowledge/tasks/sync.py similarity index 93% rename from apps/knowledge/task/sync.py rename to apps/knowledge/tasks/sync.py index 20d92386..0fc8a274 100644 --- a/apps/knowledge/task/sync.py +++ b/apps/knowledge/tasks/sync.py @@ -23,7 +23,7 @@ from ops import celery_app @celery_app.task(base=QueueOnce, once={'keys': ['knowledge_id']}, name='celery:sync_web_knowledge') def sync_web_knowledge(knowledge_id: str, url: str, selector: str): - from knowledge.task.handler import get_save_handler + from knowledge.tasks.handler import get_save_handler try: maxkb_logger.info( @@ -40,7 +40,7 @@ def sync_web_knowledge(knowledge_id: str, url: str, selector: str): @celery_app.task(base=QueueOnce, once={'keys': ['knowledge_id']}, name='celery:sync_replace_web_knowledge') def sync_replace_web_knowledge(knowledge_id: str, url: str, selector: str): - from knowledge.task.handler import get_sync_handler + from knowledge.tasks.handler import get_sync_handler try: maxkb_logger.info( @@ -56,7 +56,7 @@ def sync_replace_web_knowledge(knowledge_id: str, url: str, selector: str): @celery_app.task(name='celery:sync_web_document') def sync_web_document(knowledge_id, source_url_list: List[str], selector: str): - from knowledge.task.handler import get_sync_web_document_handler + from knowledge.tasks.handler import get_sync_web_document_handler handler = get_sync_web_document_handler(knowledge_id) for source_url in source_url_list: diff --git a/apps/knowledge/urls.py b/apps/knowledge/urls.py index 64f34bd2..d6ca1db7 100644 --- a/apps/knowledge/urls.py +++ b/apps/knowledge/urls.py @@ -26,6 +26,7 @@ urlpatterns = [ path('workspace/<str:workspace_id>/knowledge/<str:knowledge_id>/document/batch_sync', views.DocumentView.BatchSync.as_view()), path('workspace/<str:workspace_id>/knowledge/<str:knowledge_id>/document/batch_delete', views.DocumentView.BatchDelete.as_view()), path('workspace/<str:workspace_id>/knowledge/<str:knowledge_id>/document/batch_refresh', views.DocumentView.BatchRefresh.as_view()), + path('workspace/<str:workspace_id>/knowledge/<str:knowledge_id>/document/batch_advanced_learning', views.BatchAdvancedLearning.as_view()), path('workspace/<str:workspace_id>/knowledge/<str:knowledge_id>/document/batch_generate_related', views.DocumentView.BatchGenerateRelated.as_view()), path('workspace/<str:workspace_id>/knowledge/<str:knowledge_id>/document/web', views.WebDocumentView.as_view()), path('workspace/<str:workspace_id>/knowledge/<str:knowledge_id>/document/qa', views.QaDocumentView.as_view()), diff --git a/apps/knowledge/views/document.py b/apps/knowledge/views/document.py index d430f505..bc5707e6 100644 --- a/apps/knowledge/views/document.py +++ b/apps/knowledge/views/document.py @@ -13,7 +13,7 @@ from knowledge.api.document import DocumentSplitAPI, DocumentBatchAPI, DocumentB DocumentReadAPI, DocumentEditAPI, DocumentDeleteAPI, TableDocumentCreateAPI, QaDocumentCreateAPI, \ WebDocumentCreateAPI, CancelTaskAPI, BatchCancelTaskAPI, SyncWebAPI, RefreshAPI, BatchEditHitHandlingAPI, \ DocumentTreeReadAPI, DocumentSplitPatternAPI, BatchRefreshAPI, BatchGenerateRelatedAPI, TemplateExportAPI, \ - DocumentExportAPI, 
DocumentMigrateAPI, DocumentDownloadSourceAPI + DocumentExportAPI, DocumentMigrateAPI, DocumentDownloadSourceAPI, BatchAdvancedLearningAPI from knowledge.serializers.common import get_knowledge_operation_object from knowledge.serializers.document import DocumentSerializers from knowledge.views.common import get_knowledge_document_operation_object, get_document_operation_object_batch, \ @@ -817,3 +817,39 @@ class TableTemplate(APIView): tags=[_('Knowledge Base/Documentation')]) # type: ignore def get(self, request: Request): return DocumentSerializers.Export(data={'type': request.query_params.get('type')}).table_export(with_valid=True) + + +class BatchAdvancedLearning(APIView): + authentication_classes = [TokenAuth] + + @extend_schema( + methods=['PUT'], + summary=_('Batch advanced learning with MinerU'), + operation_id=_('Batch advanced learning with MinerU'), # type: ignore + request=BatchAdvancedLearningAPI.get_request(), + parameters=BatchAdvancedLearningAPI.get_parameters(), + responses=BatchAdvancedLearningAPI.get_response(), + tags=[_('Knowledge Base/Documentation')] # type: ignore + ) + @has_permissions( + PermissionConstants.KNOWLEDGE_DOCUMENT_VECTOR.get_workspace_knowledge_permission(), + PermissionConstants.KNOWLEDGE_DOCUMENT_VECTOR.get_workspace_permission_workspace_manage_role(), + PermissionConstants.KNOWLEDGE_DOCUMENT_EDIT.get_workspace_knowledge_permission(), + PermissionConstants.KNOWLEDGE_DOCUMENT_EDIT.get_workspace_permission_workspace_manage_role(), + RoleConstants.WORKSPACE_MANAGE.get_workspace_role(), + ViewPermission([RoleConstants.USER.get_workspace_role()], + [PermissionConstants.KNOWLEDGE.get_workspace_knowledge_permission()], CompareConstants.AND), + ) + @log( + menu='document', operate="Batch advanced learning", + get_operation_object=lambda r, keywords: get_knowledge_document_operation_object( + get_knowledge_operation_object(keywords.get('knowledge_id')), + get_document_operation_object_batch(r.data.get('id_list')), + ), + ) + def put(self, request: Request, workspace_id: str, knowledge_id: str): + return result.success( + DocumentSerializers.BatchAdvancedLearning( + data={'workspace_id': workspace_id, 'knowledge_id': knowledge_id} + ).batch_advanced_learning(request.data) + ) diff --git a/apps/models_provider/impl/kimi_model_provider/credential/image.py b/apps/models_provider/impl/kimi_model_provider/credential/image.py new file mode 100644 index 00000000..962009a3 --- /dev/null +++ b/apps/models_provider/impl/kimi_model_provider/credential/image.py @@ -0,0 +1,79 @@ +# coding=utf-8 +""" + @project: MaxKB + @Author:MaxKB Team + @file: image.py + @date:2024/12/20 + @desc: Kimi Image/Vision Model Credential +""" +import traceback +from typing import Dict + +from django.utils.translation import gettext_lazy as _, gettext +from langchain_core.messages import HumanMessage + +from common import forms +from common.exception.app_exception import AppApiException +from common.forms import BaseForm, TooltipLabel +from models_provider.base_model_provider import BaseModelCredential, ValidCode + + +class KimiImageModelParams(BaseForm): + temperature = forms.SliderField(TooltipLabel(_('Temperature'), + _('Higher values make the output more random, while lower values make it more focused and deterministic')), + required=True, default_value=0.3, + _min=0.1, + _max=1.0, + _step=0.01, + precision=2) + + max_tokens = forms.SliderField( + TooltipLabel(_('Output the maximum Tokens'), + _('Specify the maximum number of tokens that the model can generate')), + required=True, 
default_value=1024, + _min=1, + _max=100000, + _step=1, + precision=0) + + +class KimiImageModelCredential(BaseForm, BaseModelCredential): + api_base = forms.TextInputField('API URL', required=True) + api_key = forms.PasswordInputField('API Key', required=True) + + def is_valid(self, model_type: str, model_name, model_credential: Dict[str, object], model_params, provider, + raise_exception=False): + model_type_list = provider.get_model_type_list() + if not any(list(filter(lambda mt: mt.get('value') == model_type, model_type_list))): + raise AppApiException(ValidCode.valid_error.value, + gettext('{model_type} Model type is not supported').format(model_type=model_type)) + + for key in ['api_base', 'api_key']: + if key not in model_credential: + if raise_exception: + raise AppApiException(ValidCode.valid_error.value, gettext('{key} is required').format(key=key)) + else: + return False + try: + model = provider.get_model(model_type, model_name, model_credential, **model_params) + res = model.stream([HumanMessage(content=[{"type": "text", "text": gettext('Hello')}])]) + for chunk in res: + pass # Just consume the stream to validate + except Exception as e: + traceback.print_exc() + if isinstance(e, AppApiException): + raise e + if raise_exception: + raise AppApiException(ValidCode.valid_error.value, + gettext( + 'Verification failed, please check whether the parameters are correct: {error}').format( + error=str(e))) + else: + return False + return True + + def encryption_dict(self, model: Dict[str, object]): + return {**model, 'api_key': super().encryption(model.get('api_key', ''))} + + def get_model_params_setting_form(self, model_name): + return KimiImageModelParams() \ No newline at end of file diff --git a/apps/models_provider/impl/kimi_model_provider/kimi_model_provider.py b/apps/models_provider/impl/kimi_model_provider/kimi_model_provider.py index e1ab6d7f..681e8caa 100644 --- a/apps/models_provider/impl/kimi_model_provider/kimi_model_provider.py +++ b/apps/models_provider/impl/kimi_model_provider/kimi_model_provider.py @@ -12,11 +12,15 @@ from common.utils.common import get_file_content from models_provider.base_model_provider import IModelProvider, ModelProvideInfo, ModelInfo, \ ModelTypeConst, ModelInfoManage from models_provider.impl.kimi_model_provider.credential.llm import KimiLLMModelCredential +from models_provider.impl.kimi_model_provider.credential.image import KimiImageModelCredential from models_provider.impl.kimi_model_provider.model.llm import KimiChatModel +from models_provider.impl.kimi_model_provider.model.image import KimiImageModel from maxkb.conf import PROJECT_DIR kimi_llm_model_credential = KimiLLMModelCredential() +kimi_image_model_credential = KimiImageModelCredential() +# LLM Models moonshot_v1_8k = ModelInfo('moonshot-v1-8k', '', ModelTypeConst.LLM, kimi_llm_model_credential, KimiChatModel) moonshot_v1_32k = ModelInfo('moonshot-v1-32k', '', ModelTypeConst.LLM, kimi_llm_model_credential, @@ -24,8 +28,24 @@ moonshot_v1_32k = ModelInfo('moonshot-v1-32k', '', ModelTypeConst.LLM, kimi_llm_ moonshot_v1_128k = ModelInfo('moonshot-v1-128k', '', ModelTypeConst.LLM, kimi_llm_model_credential, KimiChatModel) -model_info_manage = ModelInfoManage.builder().append_model_info(moonshot_v1_8k).append_model_info( - moonshot_v1_32k).append_default_model_info(moonshot_v1_128k).append_default_model_info(moonshot_v1_8k).build() +# Vision/Image Models +moonshot_v1_8k_vision = ModelInfo('moonshot-v1-8k-vision-preview', 'Kimi 视觉模型 8K', ModelTypeConst.IMAGE, + kimi_image_model_credential, 
KimiImageModel) +moonshot_v1_32k_vision = ModelInfo('moonshot-v1-32k-vision-preview', 'Kimi 视觉模型 32K', ModelTypeConst.IMAGE, + kimi_image_model_credential, KimiImageModel) +moonshot_v1_128k_vision = ModelInfo('moonshot-v1-128k-vision-preview', 'Kimi 视觉模型 128K', ModelTypeConst.IMAGE, + kimi_image_model_credential, KimiImageModel) + +model_info_manage = (ModelInfoManage.builder() + .append_model_info(moonshot_v1_8k) + .append_model_info(moonshot_v1_32k) + .append_default_model_info(moonshot_v1_128k) + .append_default_model_info(moonshot_v1_8k) + .append_model_info(moonshot_v1_8k_vision) + .append_model_info(moonshot_v1_32k_vision) + .append_model_info(moonshot_v1_128k_vision) + .append_default_model_info(moonshot_v1_128k_vision) + .build()) class KimiModelProvider(IModelProvider): diff --git a/apps/models_provider/impl/kimi_model_provider/model/image.py b/apps/models_provider/impl/kimi_model_provider/model/image.py new file mode 100644 index 00000000..d2f9f8c6 --- /dev/null +++ b/apps/models_provider/impl/kimi_model_provider/model/image.py @@ -0,0 +1,32 @@ +# coding=utf-8 +""" + @project: MaxKB + @Author:MaxKB Team + @file: image.py + @date:2024/12/20 + @desc: Kimi Image/Vision Model Implementation +""" +from typing import Dict + +from models_provider.base_model_provider import MaxKBBaseModel +from models_provider.impl.base_chat_open_ai import BaseChatOpenAI + + +class KimiImageModel(MaxKBBaseModel, BaseChatOpenAI): + + @staticmethod + def is_cache_model(): + return False + + @staticmethod + def new_instance(model_type, model_name, model_credential: Dict[str, object], **model_kwargs): + optional_params = MaxKBBaseModel.filter_optional_params(model_kwargs) + + return KimiImageModel( + model_name=model_name, + openai_api_base=model_credential.get('api_base'), + openai_api_key=model_credential.get('api_key'), + streaming=True, + stream_usage=True, + extra_body=optional_params + ) \ No newline at end of file diff --git a/apps/ops/__init__.py b/apps/ops/__init__.py index a02f13af..579bec2c 100644 --- a/apps/ops/__init__.py +++ b/apps/ops/__init__.py @@ -7,3 +7,15 @@ @desc: """ from .celery import app as celery_app + +# Import and register advanced learning tasks +try: + from knowledge.tasks.advanced_learning import ( + advanced_learning_by_document, + batch_advanced_learning + ) + # Register tasks with the celery app + celery_app.register_task(advanced_learning_by_document) + celery_app.register_task(batch_advanced_learning) +except ImportError: + pass diff --git a/dev/Makefile b/dev/Makefile new file mode 100644 index 00000000..60e83619 --- /dev/null +++ b/dev/Makefile @@ -0,0 +1,67 @@ +# MaxKB development environment shortcuts + +# Quickly restart the Python process (most common) +restart: + @echo "🔄 Restarting the MaxKB Python process..." 
+ @docker exec maxkb-dev pkill -f "python.*main.py" 2>/dev/null || true + @sleep 2 + @echo "✅ Restart complete" + +# Tail live logs +logs: + docker logs -f maxkb-dev + +# Show the latest 20 log lines +log: + docker logs maxkb-dev --tail 20 + +# Open a shell in the container +shell: + docker exec -it maxkb-dev bash + +# List Python processes +ps: + @docker exec maxkb-dev ps aux | grep python | grep -v grep || echo "No Python process found" + +# Fully restart the container +full-restart: + docker compose -f docker-compose-simple.yml restart + +# Stop the container +stop: + docker compose -f docker-compose-simple.yml stop + +# Start the container +start: + docker compose -f docker-compose-simple.yml start + +# Rebuild and start +rebuild: + docker compose -f docker-compose-simple.yml down + docker compose -f docker-compose-simple.yml up -d --build + +# Show container status +status: + @docker ps --filter name=maxkb-dev --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}" + +# Clean logs +clean-logs: + @docker exec maxkb-dev bash -c "find /opt/maxkb-app -name '*.log' -type f -exec truncate -s 0 {} \;" 2>/dev/null || true + @echo "Logs cleaned" + +# Help +help: + @echo "MaxKB development environment shortcuts:" + @echo " make restart - Quickly restart the Python process (fastest)" + @echo " make logs - Tail live logs" + @echo " make log - Show the latest 20 log lines" + @echo " make shell - Open a shell in the container" + @echo " make ps - List Python processes" + @echo " make full-restart - Fully restart the container" + @echo " make stop - Stop the container" + @echo " make start - Start the container" + @echo " make rebuild - Rebuild and start" + @echo " make status - Show container status" + @echo " make clean-logs - Clean log files" + +.DEFAULT_GOAL := help \ No newline at end of file diff --git a/dev/sync_mineru_dirs.py b/dev/sync_mineru_dirs.py new file mode 100755 index 00000000..25a09e58 --- /dev/null +++ b/dev/sync_mineru_dirs.py @@ -0,0 +1,281 @@ +#!/usr/bin/env python3 +""" +Two-way sync script: synchronize files between two mineru directories +Directory 1: ~/Documents/felo/gptbase-parser/loader/mineru +Directory 2: /Users/moshui/Documents/felo/moshui/MaxKB/apps/common/handle/impl/mineru +""" + +import os +import sys +import time +import shutil +import hashlib +import argparse +from pathlib import Path +from typing import Set, Tuple + +try: + from watchdog.observers import Observer + from watchdog.events import FileSystemEventHandler + WATCHDOG_AVAILABLE = True +except ImportError: + WATCHDOG_AVAILABLE = False + print("Warning: watchdog is not installed; real-time watching is unavailable") + print("Run 'pip install watchdog' to enable real-time watching") + +class DirectorySyncer: + def __init__(self, dir1: str, dir2: str, verbose: bool = False): + self.dir1 = Path(dir1).expanduser().resolve() + self.dir2 = Path(dir2).expanduser().resolve() + self.verbose = verbose + self.exclude_patterns = { + '__pycache__', + '.DS_Store', + '*.pyc', + '*.pyo', + '.git', + '.idea', + '.vscode', + '*.swp', + '*.swo', + '*~' + } + + def should_exclude(self, path: Path) -> bool: + """Check whether a file or directory should be excluded""" + name = path.name + + for pattern in self.exclude_patterns: + if pattern.startswith('*'): + if name.endswith(pattern[1:]): + return True + elif pattern.endswith('*'): + if name.startswith(pattern[:-1]): + return True + elif name == pattern: + return True + + return False + + def get_file_hash(self, filepath: Path) -> str: + """Compute the MD5 hash of a file""" + hash_md5 = hashlib.md5() + try: + with open(filepath, "rb") as f: + for chunk in iter(lambda: f.read(4096), b""): + hash_md5.update(chunk) + return hash_md5.hexdigest() + except Exception as e: + if self.verbose: + print(f"Unable to compute hash for {filepath}: {e}") + return "" + + def get_relative_files(self, directory: Path) -> Set[Path]: + """Get the set of relative paths of all files in a directory""" + files = set() + for item in directory.rglob("*"): + if self.should_exclude(item): + continue + if item.is_file(): + 
relative_path = item.relative_to(directory) + files.add(relative_path) + return files + + def sync_file(self, source: Path, dest: Path, relative_path: Path) -> bool: + """Sync a single file""" + source_file = source / relative_path + dest_file = dest / relative_path + + try: + dest_file.parent.mkdir(parents=True, exist_ok=True) + + if not dest_file.exists(): + shutil.copy2(source_file, dest_file) + if self.verbose: + print(f"Copied: {relative_path}") + return True + else: + source_hash = self.get_file_hash(source_file) + dest_hash = self.get_file_hash(dest_file) + + if source_hash != dest_hash: + source_mtime = source_file.stat().st_mtime + dest_mtime = dest_file.stat().st_mtime + + if source_mtime > dest_mtime: + shutil.copy2(source_file, dest_file) + if self.verbose: + print(f"Updated: {relative_path} (source file is newer)") + return True + elif self.verbose: + print(f"Skipped: {relative_path} (target file is newer or identical)") + + except Exception as e: + print(f"Error syncing {relative_path}: {e}") + return False + + return False + + def sync_directories(self) -> Tuple[int, int]: + """Run a two-way sync""" + print(f"\nStarting sync...") + print(f"Directory 1: {self.dir1}") + print(f"Directory 2: {self.dir2}") + print("-" * 60) + + files1 = self.get_relative_files(self.dir1) + files2 = self.get_relative_files(self.dir2) + + all_files = files1 | files2 + synced_count = 0 + deleted_count = 0 + + for rel_path in all_files: + file1 = self.dir1 / rel_path + file2 = self.dir2 / rel_path + + if file1.exists() and not file2.exists(): + if self.sync_file(self.dir1, self.dir2, rel_path): + synced_count += 1 + print(f"→ {rel_path}") + + elif file2.exists() and not file1.exists(): + if self.sync_file(self.dir2, self.dir1, rel_path): + synced_count += 1 + print(f"← {rel_path}") + + elif file1.exists() and file2.exists(): + hash1 = self.get_file_hash(file1) + hash2 = self.get_file_hash(file2) + + if hash1 != hash2: + mtime1 = file1.stat().st_mtime + mtime2 = file2.stat().st_mtime + + if mtime1 > mtime2: + if self.sync_file(self.dir1, self.dir2, rel_path): + synced_count += 1 + print(f"→ {rel_path} (updated)") + else: + if self.sync_file(self.dir2, self.dir1, rel_path): + synced_count += 1 + print(f"← {rel_path} (updated)") + + return synced_count, deleted_count + + def watch(self): + """Start file watching""" + if not WATCHDOG_AVAILABLE: + print("Error: the watchdog module is not installed") + print("Please run: pip install watchdog") + return + + class SyncHandler(FileSystemEventHandler): + def __init__(self, syncer): + self.syncer = syncer + self.last_sync = 0 + self.sync_delay = 1 + + def on_any_event(self, event): + if event.is_directory: + return + + current_time = time.time() + if current_time - self.last_sync > self.sync_delay: + path = Path(event.src_path) + if not self.syncer.should_exclude(path): + print(f"\nChange detected: {path.name}") + self.syncer.sync_directories() + self.last_sync = current_time + + event_handler = SyncHandler(self) + observer = Observer() + observer.schedule(event_handler, str(self.dir1), recursive=True) + observer.schedule(event_handler, str(self.dir2), recursive=True) + observer.start() + + print(f"\nWatch mode started...") + print(f"Watching both directories for changes; press Ctrl+C to exit") + print("-" * 60) + + try: + while True: + time.sleep(1) + except KeyboardInterrupt: + observer.stop() + print("\nWatching stopped") + observer.join() + +def main(): + parser = argparse.ArgumentParser( + description="Two-way sync between two mineru directories", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + %(prog)s # run the sync once + %(prog)s --watch # start real-time watch mode + %(prog)s --verbose # show verbose output + %(prog)s --dry-run # dry run without actually syncing + """ + ) + + parser.add_argument( + "--watch", "-w", + action="store_true", + 
help="启动监控模式,实时同步文件变化" + ) + + parser.add_argument( + "--verbose", "-v", + action="store_true", + help="显示详细输出" + ) + + parser.add_argument( + "--dry-run", "-n", + action="store_true", + help="模拟运行,只显示将要执行的操作" + ) + + parser.add_argument( + "--dir1", + default="~/Documents/felo/gptbase-parser/loader/mineru", + help="第一个目录路径 (默认: ~/Documents/felo/gptbase-parser/loader/mineru)" + ) + + parser.add_argument( + "--dir2", + default="/Users/moshui/Documents/felo/moshui/MaxKB/apps/common/handle/impl/mineru", + help="第二个目录路径" + ) + + args = parser.parse_args() + + syncer = DirectorySyncer(args.dir1, args.dir2, verbose=args.verbose) + + if not syncer.dir1.exists(): + print(f"错误: 目录不存在 - {syncer.dir1}") + sys.exit(1) + + if not syncer.dir2.exists(): + print(f"错误: 目录不存在 - {syncer.dir2}") + sys.exit(1) + + if args.dry_run: + print("模拟运行模式 - 不会实际修改文件") + syncer.verbose = True + + if args.watch: + syncer.sync_directories() + syncer.watch() + else: + synced, deleted = syncer.sync_directories() + print("-" * 60) + print(f"同步完成: {synced} 个文件已同步") + if deleted > 0: + print(f" {deleted} 个文件已删除") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/ui/src/api/knowledge/document.ts b/ui/src/api/knowledge/document.ts index 54a79a44..8a90daab 100644 --- a/ui/src/api/knowledge/document.ts +++ b/ui/src/api/knowledge/document.ts @@ -361,6 +361,35 @@ const putBatchRefresh: ( ) } +/** + * 批量高级学习(重新解析) + * @param 参数 knowledge_id, + *{ + "id_list": [ + "3fa85f64-5717-4562-b3fc-2c963f66afa6" + ], + "llm_model": "model_id", + "vision_model": "model_id" +} + */ +const putBatchAdvancedLearning: ( + knowledge_id: string, + idList: string[], + models: { llmModel: string; visionModel: string }, + loading?: Ref, +) => Promise> = (knowledge_id, idList, models, loading) => { + return put( + `${prefix.value}/${knowledge_id}/document/batch_advanced_learning`, + { + id_list: idList, + llm_model: models.llmModel, + vision_model: models.visionModel, + }, + undefined, + loading, + ) +} + /** * 批量同步文档 * @param 参数 knowledge_id, @@ -582,6 +611,7 @@ export default { putBatchGenerateRelated, putBatchEditHitHandling, putBatchRefresh, + putBatchAdvancedLearning, putMulSyncDocument, putMigrateMulDocument, postQADocument, diff --git a/ui/src/locales/lang/zh-CN/common.ts b/ui/src/locales/lang/zh-CN/common.ts index 4da4ccce..d5bf3fc5 100644 --- a/ui/src/locales/lang/zh-CN/common.ts +++ b/ui/src/locales/lang/zh-CN/common.ts @@ -111,4 +111,5 @@ export default { copyTitle: '副本', professional: '购买专业版', sync: '同步', + shared: '共享', } diff --git a/ui/src/locales/lang/zh-CN/views/document.ts b/ui/src/locales/lang/zh-CN/views/document.ts index 482c7e01..4c1848e7 100644 --- a/ui/src/locales/lang/zh-CN/views/document.ts +++ b/ui/src/locales/lang/zh-CN/views/document.ts @@ -42,7 +42,7 @@ export default { fileType: { txt: { - label: '文本文件', + label: '普通学习', tip1: '1、文件上传前,建议规范文件的分段标识', }, table: { @@ -97,6 +97,7 @@ export default { name: '文件名称', char_length: '字符数', paragraph: '分段', + learningType: '学习方式', all: '全部', updateTime: '更新时间', }, @@ -109,6 +110,7 @@ export default { GENERATE: '生成中', SYNC: '同步中', REVOKE: '取消中', + PARSING: '学习中', finish: '完成', }, enableStatus: { @@ -181,4 +183,23 @@ export default { allCheck: '全选', errorMessage1: '请选择文档', }, + advancedLearning: { + title: '高级学习', + button: '高级学习', + llmModel: '大语言模型', + visionModel: '视觉模型', + selectLlmModel: '请选择大语言模型', + selectVisionModel: '请选择视觉模型', + llmModelRequired: '请选择大语言模型', + visionModelRequired: '请选择视觉模型', + tip1: '高级学习提供高质量的 PDF 和 PPT 文档解析,支持复杂表格、图片、公式等内容', + tip2: 
'原有段落和索引将被删除,文档将使用高级学习重新解析', + tip3: '解析过程可能需要较长时间,请耐心等待', + successMessage: '高级学习任务已启动', + loadModelsFailed: '加载模型列表失败', + }, + learningType: { + regular: '普通学习', + advanced: '高级学习', + }, } diff --git a/ui/src/utils/status.ts b/ui/src/utils/status.ts index f8f4ca7e..2f3b414c 100644 --- a/ui/src/utils/status.ts +++ b/ui/src/utils/status.ts @@ -20,6 +20,8 @@ interface StateInterface { REVOKE: '4' // 取消成功 REVOKED: '5' + // Parsing + PARSING: '6' IGNORED: 'n' } const TaskType: TaskTypeInterface = { @@ -40,6 +42,8 @@ const State: StateInterface = { REVOKE: '4', // 取消成功 REVOKED: '5', + // Parsing + PARSING: '6', IGNORED: 'n' } class Status { diff --git a/ui/src/views/document/component/AdvancedLearningDialog.vue b/ui/src/views/document/component/AdvancedLearningDialog.vue new file mode 100644 index 00000000..e6bc1afa --- /dev/null +++ b/ui/src/views/document/component/AdvancedLearningDialog.vue @@ -0,0 +1,185 @@ + + + + + \ No newline at end of file diff --git a/ui/src/views/document/component/Status.vue b/ui/src/views/document/component/Status.vue index db9bc177..15401095 100644 --- a/ui/src/views/document/component/Status.vue +++ b/ui/src/views/document/component/Status.vue @@ -35,6 +35,10 @@ {{ stateMap[aggStatus.value](aggStatus.key) }} + + + {{ stateMap[aggStatus.value](aggStatus.key) }} + @@ -49,6 +53,7 @@ const checkList: Array<string> = [ State.REVOKE, State.STARTED, State.PENDING, + State.PARSING, State.FAILURE, State.REVOKED, State.SUCCESS @@ -82,6 +87,7 @@ const stateMap: any = { [State.REVOKED]: (type: number) => t('views.document.fileStatus.SUCCESS'), [State.FAILURE]: (type: number) => t('views.document.fileStatus.FAILURE'), [State.SUCCESS]: (type: number) => t('views.document.fileStatus.SUCCESS'), + [State.PARSING]: (type: number) => t('views.document.fileStatus.PARSING'), } diff --git a/ui/src/views/document/index.vue b/ui/src/views/document/index.vue index b30a6438..15b52fa5 100644 --- a/ui/src/views/document/index.vue +++ b/ui/src/views/document/index.vue @@ -44,6 +44,12 @@ v-if="permissionPrecise.doc_vector(id)" >{{ $t('views.knowledge.setting.vectorization') }} + {{ $t('views.document.advancedLearning.button') }} + + + + +
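
Reviewer note — a minimal TypeScript usage sketch of the new endpoint, for reference only; it is not part of the patch. It assumes the '@/api/knowledge/document' path alias for the default export shown above, and the model ids are placeholders:

// Trigger advanced learning for a set of documents in a knowledge base.
import documentApi from '@/api/knowledge/document' // assumed alias for ui/src/api/knowledge/document.ts

async function startAdvancedLearning(knowledgeId: string, selectedIds: string[]) {
  // llm_model and vision_model are both required by DocumentBatchAdvancedLearningSerializer
  const models = { llmModel: 'llm-model-id', visionModel: 'vision-model-id' } // placeholder ids
  // PUT workspace/<workspace_id>/knowledge/<knowledge_id>/document/batch_advanced_learning
  await documentApi.putBatchAdvancedLearning(knowledgeId, selectedIds, models)
  // The backend marks each document State.PARSING ('6') and dispatches the
  // celery:batch_advanced_learning task; progress is then surfaced by Status.vue.
}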