modify status parsing

朱潮 2025-08-25 01:20:33 +08:00
parent f0263bf189
commit f1494fedea
35 changed files with 1248 additions and 129 deletions

View File

@@ -26,7 +26,7 @@ from common.utils.common import post
 from knowledge.models import Paragraph, Document, Problem, ProblemParagraphMapping, Knowledge
 from knowledge.serializers.common import get_embedding_model_id_by_knowledge_id, update_document_char_length
 from knowledge.serializers.paragraph import ParagraphSerializers
-from knowledge.task.embedding import embedding_by_paragraph, embedding_by_paragraph_list
+from knowledge.tasks.embedding import embedding_by_paragraph, embedding_by_paragraph_list
 class ChatRecordSerializerModel(serializers.ModelSerializer):

View File

@@ -275,7 +275,7 @@ class ListenerManagement:
             return
         maxkb_logger.info(_('Start--->Embedding document: {document_id}').format(document_id=document_id))
-        # Batch update status to PADDING
+        # Batch update status to STARTED
         ListenerManagement.update_status(QuerySet(Document).filter(id=document_id), TaskType.EMBEDDING,
                                          State.STARTED)

View File

@@ -11,7 +11,7 @@ GPTBase platform adapter - implements GPTBase-specific functionality
 import contextlib
 from typing import Any, Dict
 from .logger import logger
+import os
 from .gptbase_utils import GPTBaseUtils, GZeroUtils
 from .config_gptbase import GPTBaseMinerUConfig
 from ..base_parser import PlatformAdapter, BaseMinerUExtractor, ProcessingResult

View File

@@ -242,7 +242,7 @@ class MinerUAdapter:
             raise
     def process_document(self, file_content: bytes, file_name: str,
-                         save_image_func=None) -> Dict[str, Any]:
+                         save_image_func=None, **kwargs) -> Dict[str, Any]:
         """
         Process a document and return structured content
@@ -250,6 +250,7 @@ class MinerUAdapter:
             file_content: file content as a byte stream
             file_name: file name
             save_image_func: function used to save images
+            **kwargs: extra parameters, including llm_model_id and vision_model_id
         Returns:
             a dict containing sections; each section contains content, title and images
@@ -276,6 +277,14 @@ class MinerUAdapter:
         loop = asyncio.new_event_loop()
         asyncio.set_event_loop(loop)
         try:
+            # Extract the model ID parameters
+            llm_model_id = kwargs.get('llm_model_id')
+            vision_model_id = kwargs.get('vision_model_id')
+            if llm_model_id and vision_model_id:
+                logger.info(f"Processing document with specified models: LLM={llm_model_id}, Vision={vision_model_id}")
+                # TODO: pass the model IDs through to the extractor
+                # For now the default configuration is used; the models can be wired in here later
             result = loop.run_until_complete(
                 self.extractor.process_file(tmp_file_path, file_name)
             )
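A caller-side sketch of the extended signature (file name and model IDs below are illustrative placeholders, not values from this commit); note the adapter currently only logs the IDs and still runs the default extractor configuration until the TODO above is resolved:

adapter = MinerUAdapter()
with open('sample.pdf', 'rb') as f:
    result = adapter.process_document(
        file_content=f.read(),
        file_name='sample.pdf',
        save_image_func=None,
        llm_model_id='llm-model-uuid',        # forwarded via **kwargs
        vision_model_id='vision-model-uuid',  # forwarded via **kwargs
    )
for section in result.get('sections', []):
    print(section.get('title'), len(section.get('content', '')))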

View File

@@ -41,29 +41,43 @@ class MinerUSplitHandle(BaseSplitHandle):
         return True
     def handle(self, file, pattern_list: List, with_filter: bool, limit: int,
-               get_buffer, save_image):
+               get_buffer, save_image, **kwargs):
         """
         Process the document with MinerU
         """
         try:
             logger.info(f"MinerUSplitHandle.handle called for file: {file.name if hasattr(file, 'name') else 'unknown'}")
-            # Initialize the MinerU adapter
+            # Initialize the MinerU adapter, passing the model IDs if provided
             if not self.mineru_adapter:
                 logger.info("Initializing MinerU adapter")
+                llm_model_id = kwargs.get('llm_model_id')
+                vision_model_id = kwargs.get('vision_model_id')
+                if llm_model_id and vision_model_id:
+                    logger.info(f"Using models: LLM={llm_model_id}, Vision={vision_model_id}")
                 self.mineru_adapter = MinerUAdapter()
             # Fetch the file content
             buffer = get_buffer(file)
             logger.info(f"File buffer size: {len(buffer) if buffer else 0} bytes")
-            # Process the document
+            # Process the document, forwarding the model IDs to the adapter
             logger.info("Calling MinerU adapter to process document")
-            result = self.mineru_adapter.process_document(
-                file_content=buffer,
-                file_name=file.name if hasattr(file, 'name') else 'document.pdf',
-                save_image_func=save_image
-            )
+            process_kwargs = {
+                'file_content': buffer,
+                'file_name': file.name if hasattr(file, 'name') else 'document.pdf',
+                'save_image_func': save_image
+            }
+            # If model IDs were supplied, pass them to the adapter
+            llm_model_id = kwargs.get('llm_model_id')
+            vision_model_id = kwargs.get('vision_model_id')
+            if llm_model_id:
+                process_kwargs['llm_model_id'] = llm_model_id
+            if vision_model_id:
+                process_kwargs['vision_model_id'] = vision_model_id
+            result = self.mineru_adapter.process_document(**process_kwargs)
             logger.info(f"MinerU adapter returned result with {len(result.get('sections', []))} sections")
             # Convert to paragraph format

View File

@@ -6,7 +6,8 @@ from common.result import DefaultResultSerializer
 from knowledge.serializers.common import BatchSerializer
 from knowledge.serializers.document import DocumentInstanceSerializer, DocumentWebInstanceSerializer, \
     CancelInstanceSerializer, BatchCancelInstanceSerializer, DocumentRefreshSerializer, BatchEditHitHandlingSerializer, \
-    DocumentBatchRefreshSerializer, DocumentBatchGenerateRelatedSerializer, DocumentMigrateSerializer
+    DocumentBatchRefreshSerializer, DocumentBatchGenerateRelatedSerializer, DocumentMigrateSerializer, \
+    DocumentBatchAdvancedLearningSerializer
 class DocumentSplitAPI(APIMixin):
@@ -473,6 +474,35 @@ class DocumentExportAPI(APIMixin):
         return DefaultResultSerializer
+class BatchAdvancedLearningAPI(APIMixin):
+    @staticmethod
+    def get_parameters():
+        return [
+            OpenApiParameter(
+                name="workspace_id",
+                description="workspace id",
+                type=OpenApiTypes.STR,
+                location='path',
+                required=True,
+            ),
+            OpenApiParameter(
+                name="knowledge_id",
+                description="knowledge base id",
+                type=OpenApiTypes.STR,
+                location='path',
+                required=True,
+            ),
+        ]
+    @staticmethod
+    def get_request():
+        return DocumentBatchAdvancedLearningSerializer
+    @staticmethod
+    def get_response():
+        return DefaultResultSerializer
 class DocumentMigrateAPI(APIMixin):
     @staticmethod
     def get_parameters():

View File

@@ -47,6 +47,8 @@ class State(Enum):
     REVOKE = '4'
     # Revoke succeeded
     REVOKED = '5'
+    # Parsing
+    PARSING = '6'
     # Ignored
     IGNORED = 'n'
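For orientation, a sketch of how the new PARSING state slots into the task lifecycle used elsewhere in this diff (the transitions are driven by ListenerManagement.update_status calls; this helper is illustrative only, not part of the commit):

# Illustrative only: PARSING ('6') is set when the MinerU task is submitted;
# embedding_by_document later moves the document to STARTED, then SUCCESS or FAILURE.
def is_in_progress(state: State) -> bool:
    # PENDING, STARTED and PARSING all render as spinner states in the UI
    return state in (State.PENDING, State.STARTED, State.PARSING)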

View File

@@ -45,16 +45,16 @@ from common.utils.fork import Fork
 from common.utils.logger import maxkb_logger
 from common.utils.split_model import get_split_model, flat_map
 from knowledge.models import Knowledge, Paragraph, Problem, Document, KnowledgeType, ProblemParagraphMapping, State, \
-    TaskType, File, FileSourceType
+    TaskType, File, FileSourceType, get_default_status
 from knowledge.serializers.common import ProblemParagraphManage, BatchSerializer, \
     get_embedding_model_id_by_knowledge_id, MetaSerializer, write_image, zip_dir
 from knowledge.serializers.paragraph import ParagraphSerializers, ParagraphInstanceSerializer, \
     delete_problems_and_mappings
-from knowledge.task.embedding import embedding_by_document, delete_embedding_by_document_list, \
+from knowledge.tasks.embedding import embedding_by_document, delete_embedding_by_document_list, \
     delete_embedding_by_document, delete_embedding_by_paragraph_ids, embedding_by_document_list, \
     update_embedding_knowledge_id
-from knowledge.task.generate import generate_related_by_document_id
+from knowledge.tasks.generate import generate_related_by_document_id
-from knowledge.task.sync import sync_web_document
+from knowledge.tasks.sync import sync_web_document
 from maxkb.const import PROJECT_DIR
 from models_provider.models import Model
 from oss.serializers.file import FileSerializer
@@ -200,6 +200,12 @@ class DocumentMigrateSerializer(serializers.Serializer):
     document_id_list = serializers.ListField(required=True, label=_('document id list'))
+class DocumentBatchAdvancedLearningSerializer(serializers.Serializer):
+    id_list = serializers.ListField(required=True, label=_('document id list'))
+    llm_model = serializers.CharField(required=True, label=_('llm model id'))
+    vision_model = serializers.CharField(required=True, label=_('vision model id'))
 class BatchEditHitHandlingSerializer(serializers.Serializer):
     id_list = serializers.ListField(required=True, child=serializers.UUIDField(required=True), label=_('id list'))
     hit_handling_method = serializers.CharField(required=True, label=_('hit handling method'))
@@ -866,6 +872,11 @@ class DocumentSerializers(serializers.Serializer):
     @staticmethod
     def get_document_paragraph_model(knowledge_id, instance: Dict):
         source_meta = {'source_file_id': instance.get('source_file_id')} if instance.get('source_file_id') else {}
+        # Add the MinerU model parameters to meta
+        if instance.get('llm_model_id'):
+            source_meta['llm_model_id'] = instance.get('llm_model_id')
+        if instance.get('vision_model_id'):
+            source_meta['vision_model_id'] = instance.get('vision_model_id')
         meta = {**instance.get('meta'), **source_meta} if instance.get('meta') is not None else source_meta
         meta = convert_uuid_to_str(meta)
@@ -1156,99 +1167,10 @@ class DocumentSerializers(serializers.Serializer):
             maxkb_logger.info(f"Processing document: {document.get('name')}, llm_model_id: {llm_model_id}, vision_model_id: {vision_model_id}")
             if llm_model_id or vision_model_id:
-                maxkb_logger.info(f"Document {document.get('name')} is MinerU type, processing with MinerU handler")
-                source_file_id = document.get('source_file_id')
-                maxkb_logger.info(f"Source file ID: {source_file_id}")
-                if source_file_id:
-                    # Fetch the source file
-                    source_file = QuerySet(File).filter(id=source_file_id).first()
-                    maxkb_logger.info(f"Source file found: {source_file is not None}")
-                    if source_file:
-                        try:
-                            # Re-parse the document with the MinerU handler
-                            from common.handle.impl.text.mineru_split_handle import MinerUSplitHandle
-                            from common.utils.split_model import get_split_model
-                            import io
-                            # Check the MinerU configuration
-                            mineru_api_type = os.environ.get('MINERU_API_TYPE', '')
-                            if not mineru_api_type:
-                                maxkb_logger.warning(f"MinerU API not configured, skipping MinerU processing for document: {document.get('name')}")
-                                continue
-                            maxkb_logger.info(f"MinerU API configured: {mineru_api_type}")
-                            mineru_handler = MinerUSplitHandle()
-                            # Fetch the file content
-                            file_content = source_file.get_bytes()
-                            temp_file = io.BytesIO(file_content)
-                            temp_file.name = source_file.file_name
-                            # Derive the split pattern from the existing paragraphs,
-                            # i.e. the split rules the user chose during preview
-                            pattern_list = []
-                            if 'paragraphs' in document and len(document['paragraphs']) > 0:
-                                # Try to obtain the split pattern from metadata or elsewhere;
-                                # the split rules have to be passed in from the frontend
-                                patterns = document.get('split_patterns', [])
-                                if patterns:
-                                    pattern_list = [get_split_model(pattern) for pattern in patterns if pattern]
-                            def get_buffer(file):
-                                file.seek(0)
-                                return file.read()
-                            def save_image(image_list):
-                                if image_list is not None and len(image_list) > 0:
-                                    exist_image_list = [str(i.get('id')) for i in
-                                                        QuerySet(File).filter(id__in=[i.id for i in image_list]).values('id')]
-                                    save_image_list = [image for image in image_list if not exist_image_list.__contains__(str(image.id))]
-                                    save_image_list = list({img.id: img for img in save_image_list}.values())
-                                    for file in save_image_list:
-                                        file_bytes = file.meta.pop('content')
-                                        file.meta['knowledge_id'] = knowledge_id
-                                        file.source_type = FileSourceType.KNOWLEDGE
-                                        file.source_id = knowledge_id
-                                        file.save(file_bytes)
-                            # Process with MinerU; is_preview is not passed, so MinerU is actually used
-                            maxkb_logger.info(f"Using MinerU to process document: {document.get('name')}")
-                            paragraphs = mineru_handler.handle(
-                                temp_file,
-                                pattern_list,
-                                False,  # with_filter
-                                0,  # limit (0 means unlimited)
-                                get_buffer,
-                                save_image
-                            )
-                            # Preserve any existing problem lists
-                            if 'paragraphs' in document and document['paragraphs']:
-                                # Try to merge the existing problem lists into the new paragraphs
-                                old_problems = {}
-                                for old_para in document['paragraphs']:
-                                    if 'content' in old_para and 'problem_list' in old_para:
-                                        old_problems[old_para['content'][:100]] = old_para['problem_list']
-                                # Attach problem lists to new paragraphs with similar content
-                                for new_para in paragraphs:
-                                    content_key = new_para.get('content', '')[:100]
-                                    if content_key in old_problems:
-                                        new_para['problem_list'] = old_problems[content_key]
-                            # Replace the document paragraphs with the MinerU output
-                            maxkb_logger.info(f"MinerU returned {len(paragraphs) if paragraphs else 0} paragraphs")
-                            if paragraphs and len(paragraphs) > 0:
-                                maxkb_logger.info(f"First paragraph sample: {paragraphs[0] if paragraphs else 'None'}")
-                                document['paragraphs'] = paragraphs
-                            else:
-                                maxkb_logger.warning(f"MinerU returned empty paragraphs, keeping original paragraphs")
-                            maxkb_logger.info(f"MinerU processing completed for document: {document.get('name')}, paragraphs count: {len(document.get('paragraphs', []))}")
-                        except Exception as e:
-                            # If MinerU processing fails, keep the original paragraphs
-                            maxkb_logger.error(f"MinerU processing failed for document {document.get('name')}: {str(e)}", exc_info=True)
-                            # Leave the existing paragraphs unchanged
+                maxkb_logger.info(f"Document {document.get('name')} is MinerU type, will process asynchronously")
+                # MinerU-type documents only save their basic info here; paragraphs are not processed
+                # Paragraph processing happens in an asynchronous task
+                document['paragraphs'] = []  # Clear the paragraphs and wait for async processing
         # Insert documents
         for document in instance_list:
@@ -1270,7 +1192,44 @@ class DocumentSerializers(serializers.Serializer):
             )
         # Insert documents
         QuerySet(Document).bulk_create(document_model_list) if len(document_model_list) > 0 else None
-        # Bulk-insert paragraphs
+        # Handle the async tasks for advanced-learning documents
+        for idx, document in enumerate(instance_list):
+            llm_model_id = document.get('llm_model_id')
+            vision_model_id = document.get('vision_model_id')
+            if llm_model_id and vision_model_id and document_model_list:
+                # Find the matching document model
+                document_model = document_model_list[idx]
+                maxkb_logger.info(f"Submitting async advanced learning task for document: {document_model.id}")
+                # Set the document status to PARSING
+                ListenerManagement.update_status(
+                    QuerySet(Document).filter(id=document_model.id),
+                    TaskType.EMBEDDING,
+                    State.PARSING
+                )
+                # Submit the async task
+                try:
+                    from knowledge.tasks.advanced_learning import advanced_learning_by_document
+                    advanced_learning_by_document.delay(
+                        str(document_model.id),
+                        str(knowledge_id),
+                        self.data.get('workspace_id', ''),
+                        llm_model_id,
+                        vision_model_id
+                    )
+                    maxkb_logger.info(f"Advanced learning task submitted for document {document_model.id}")
+                except Exception as e:
+                    maxkb_logger.error(f"Failed to submit advanced learning task: {str(e)}")
+                    # If submission fails, mark the document as failed
+                    ListenerManagement.update_status(
+                        QuerySet(Document).filter(id=document_model.id),
+                        TaskType.EMBEDDING,
+                        State.FAILURE
+                    )
+        # Bulk-insert paragraphs (only for non-advanced-learning documents)
         if len(paragraph_model_list) > 0:
             for document in document_model_list:
                 max_position = Paragraph.objects.filter(document_id=document.id).aggregate(
@@ -1439,6 +1398,79 @@ class DocumentSerializers(serializers.Serializer):
             pass
+    class BatchAdvancedLearning(serializers.Serializer):
+        workspace_id = serializers.CharField(required=True, label=_('workspace id'))
+        knowledge_id = serializers.UUIDField(required=True, label=_('knowledge id'))
+        def is_valid(self, *, raise_exception=False):
+            super().is_valid(raise_exception=True)
+            workspace_id = self.data.get('workspace_id')
+            query_set = QuerySet(Knowledge).filter(id=self.data.get('knowledge_id'))
+            if workspace_id:
+                query_set = query_set.filter(workspace_id=workspace_id)
+            if not query_set.exists():
+                raise AppApiException(500, _('Knowledge id does not exist'))
+        def batch_advanced_learning(self, instance: Dict, with_valid=True):
+            if with_valid:
+                self.is_valid(raise_exception=True)
+            document_id_list = instance.get("id_list", [])
+            llm_model_id = instance.get("llm_model")
+            vision_model_id = instance.get("vision_model")
+            if not document_id_list:
+                raise AppApiException(500, _('Document list is empty'))
+            if not llm_model_id or not vision_model_id:
+                raise AppApiException(500, _('Model selection is required'))
+            knowledge_id = self.data.get('knowledge_id')
+            workspace_id = self.data.get('workspace_id')
+            # Fetch the knowledge base
+            knowledge = QuerySet(Knowledge).filter(id=knowledge_id).first()
+            if not knowledge:
+                raise AppApiException(500, _('Knowledge not found'))
+            # Check the MinerU configuration
+            import os
+            mineru_api_type = os.environ.get('MINERU_API_TYPE', '')
+            if not mineru_api_type:
+                raise AppApiException(500, _('MinerU API not configured'))
+            # Set the document status to PARSING (rather than queued)
+            for document_id in document_id_list:
+                ListenerManagement.update_status(
+                    QuerySet(Document).filter(id=document_id),
+                    TaskType.EMBEDDING,
+                    State.PARSING
+                )
+            # Hand the documents off to the async task
+            try:
+                from knowledge.tasks.advanced_learning import batch_advanced_learning
+                batch_advanced_learning.delay(
+                    document_id_list,
+                    str(knowledge_id),
+                    workspace_id,
+                    llm_model_id,
+                    vision_model_id
+                )
+                maxkb_logger.info(f"Submitted advanced learning tasks for {len(document_id_list)} documents")
+            except Exception as e:
+                maxkb_logger.error(f"Failed to submit advanced learning tasks: {str(e)}")
+                # If submission fails, mark the documents as failed
+                for document_id in document_id_list:
+                    ListenerManagement.update_status(
+                        QuerySet(Document).filter(id=document_id),
+                        TaskType.EMBEDDING,
+                        State.FAILURE
+                    )
+                raise AppApiException(500, _('Failed to submit advanced learning tasks'))
+            return True
 class FileBufferHandle:
     buffer = None
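A minimal sketch of driving this serializer directly (all IDs are placeholders; the real call site is the BatchAdvancedLearning view added near the end of this diff):

DocumentSerializers.BatchAdvancedLearning(
    data={'workspace_id': 'default', 'knowledge_id': 'knowledge-uuid'}
).batch_advanced_learning({
    'id_list': ['document-uuid'],
    'llm_model': 'llm-model-uuid',
    'vision_model': 'vision-model-uuid',
})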

View File

@ -33,9 +33,9 @@ from knowledge.models import Knowledge, KnowledgeScope, KnowledgeType, Document,
from knowledge.serializers.common import ProblemParagraphManage, drop_knowledge_index, get_embedding_model_id_by_knowledge_id, MetaSerializer, \ from knowledge.serializers.common import ProblemParagraphManage, drop_knowledge_index, get_embedding_model_id_by_knowledge_id, MetaSerializer, \
GenerateRelatedSerializer, get_embedding_model_by_knowledge_id, list_paragraph, write_image, zip_dir GenerateRelatedSerializer, get_embedding_model_by_knowledge_id, list_paragraph, write_image, zip_dir
from knowledge.serializers.document import DocumentSerializers from knowledge.serializers.document import DocumentSerializers
from knowledge.task.embedding import embedding_by_knowledge, delete_embedding_by_knowledge from knowledge.tasks.embedding import embedding_by_knowledge, delete_embedding_by_knowledge
from knowledge.task.generate import generate_related_by_knowledge_id from knowledge.tasks.generate import generate_related_by_knowledge_id
from knowledge.task.sync import sync_web_knowledge, sync_replace_web_knowledge from knowledge.tasks.sync import sync_web_knowledge, sync_replace_web_knowledge
from maxkb.conf import PROJECT_DIR from maxkb.conf import PROJECT_DIR
from models_provider.models import Model from models_provider.models import Model
from system_manage.models import WorkspaceUserResourcePermission, AuthTargetType from system_manage.models import WorkspaceUserResourcePermission, AuthTargetType

View File

@ -18,11 +18,11 @@ from knowledge.models import Paragraph, Problem, Document, ProblemParagraphMappi
from knowledge.serializers.common import ProblemParagraphObject, ProblemParagraphManage, \ from knowledge.serializers.common import ProblemParagraphObject, ProblemParagraphManage, \
get_embedding_model_id_by_knowledge_id, update_document_char_length, BatchSerializer get_embedding_model_id_by_knowledge_id, update_document_char_length, BatchSerializer
from knowledge.serializers.problem import ProblemInstanceSerializer, ProblemSerializer, ProblemSerializers from knowledge.serializers.problem import ProblemInstanceSerializer, ProblemSerializer, ProblemSerializers
from knowledge.task.embedding import embedding_by_paragraph, enable_embedding_by_paragraph, \ from knowledge.tasks.embedding import embedding_by_paragraph, enable_embedding_by_paragraph, \
disable_embedding_by_paragraph, \ disable_embedding_by_paragraph, \
delete_embedding_by_paragraph, embedding_by_problem as embedding_by_problem_task, delete_embedding_by_paragraph_ids, \ delete_embedding_by_paragraph, embedding_by_problem as embedding_by_problem_task, delete_embedding_by_paragraph_ids, \
embedding_by_problem, delete_embedding_by_source, update_embedding_document_id embedding_by_problem, delete_embedding_by_source, update_embedding_document_id
from knowledge.task.generate import generate_related_by_paragraph_id_list from knowledge.tasks.generate import generate_related_by_paragraph_id_list
class ParagraphSerializer(serializers.ModelSerializer): class ParagraphSerializer(serializers.ModelSerializer):

View File

@ -13,7 +13,7 @@ from common.exception.app_exception import AppApiException
from common.utils.common import get_file_content from common.utils.common import get_file_content
from knowledge.models import Problem, ProblemParagraphMapping, Paragraph, Knowledge, SourceType from knowledge.models import Problem, ProblemParagraphMapping, Paragraph, Knowledge, SourceType
from knowledge.serializers.common import get_embedding_model_id_by_knowledge_id from knowledge.serializers.common import get_embedding_model_id_by_knowledge_id
from knowledge.task.embedding import delete_embedding_by_source_ids, update_problem_embedding, embedding_by_data_list from knowledge.tasks.embedding import delete_embedding_by_source_ids, update_problem_embedding, embedding_by_data_list
from maxkb.const import PROJECT_DIR from maxkb.const import PROJECT_DIR

View File

@@ -3,7 +3,11 @@ SELECT
     "document".* ,
     to_json("document"."meta") as meta,
     to_json("document"."status_meta") as status_meta,
-    (SELECT "count"("id") FROM "paragraph" WHERE document_id="document"."id") as "paragraph_count"
+    (SELECT "count"("id") FROM "paragraph" WHERE document_id="document"."id") as "paragraph_count",
+    CASE
+        WHEN "document"."meta"->>'llm_model_id' IS NOT NULL THEN 'advanced'
+        ELSE 'regular'
+    END as "learning_type"
 FROM
     "document" "document"
 ${document_custom_sql}

View File

@@ -0,0 +1,5 @@
# coding=utf-8
# Import tasks for Celery discovery
# Note: We import the specific tasks, not * to avoid circular imports
from .advanced_learning import advanced_learning_by_document, batch_advanced_learning

View File

@@ -0,0 +1,208 @@
# coding=utf-8
"""
Advanced learning tasks - re-parse documents with MinerU
"""
import traceback
import uuid as uuid_lib
from typing import List
from celery_once import QueueOnce
from django.db.models import QuerySet
from ops import celery_app
@celery_app.task(name='celery:advanced_learning_by_document')
def advanced_learning_by_document(document_id: str, knowledge_id: str, workspace_id: str,
                                  llm_model_id: str, vision_model_id: str):
    """
    Process a document with MinerU advanced learning
    @param document_id: document ID
    @param knowledge_id: knowledge base ID
    @param workspace_id: workspace ID
    @param llm_model_id: large language model ID
    @param vision_model_id: vision model ID
    """
    # Deferred imports to avoid circular dependencies
    from common.event import ListenerManagement
    from common.utils.logger import maxkb_logger
    from knowledge.models import Document, Knowledge, Paragraph, State, TaskType, File, FileSourceType, get_default_status
    from knowledge.serializers.paragraph import delete_problems_and_mappings
    from knowledge.tasks.embedding import delete_embedding_by_document, embedding_by_document
    maxkb_logger.info(f"Starting advanced learning for document {document_id}")
    try:
        # Immediately set the status to PARSING
        from common.event import ListenerManagement
        ListenerManagement.update_status(
            QuerySet(Document).filter(id=document_id),
            TaskType.EMBEDDING,
            State.PARSING
        )
        maxkb_logger.info(f"Updated document {document_id} status to PARSING")
        # Fetch the document
        document = QuerySet(Document).filter(id=document_id).first()
        if not document:
            maxkb_logger.error(f"Document {document_id} not found")
            return
        # Fetch the knowledge base
        knowledge = QuerySet(Knowledge).filter(id=knowledge_id).first()
        if not knowledge:
            maxkb_logger.error(f"Knowledge {knowledge_id} not found")
            return
        # Fetch the source file
        source_file_id = document.meta.get('source_file_id')
        if not source_file_id:
            maxkb_logger.warning(f"No source file for document {document.id}")
            ListenerManagement.update_status(
                QuerySet(Document).filter(id=document_id),
                TaskType.EMBEDDING,
                State.FAILURE
            )
            return
        source_file = QuerySet(File).filter(id=source_file_id).first()
        if not source_file:
            maxkb_logger.warning(f"Source file not found for document {document.id}")
            ListenerManagement.update_status(
                QuerySet(Document).filter(id=document_id),
                TaskType.EMBEDDING,
                State.FAILURE
            )
            return
        # Delete the existing paragraphs and vector data
        QuerySet(Paragraph).filter(document_id=document_id).delete()
        delete_problems_and_mappings([document_id])
        delete_embedding_by_document(document_id)
        # Record the models used in the document metadata
        document.meta['llm_model_id'] = llm_model_id
        document.meta['vision_model_id'] = vision_model_id
        document.save()
        # Re-parse the document with MinerU
        from common.handle.impl.text.mineru_split_handle import MinerUSplitHandle
        import io
        mineru_handler = MinerUSplitHandle()
        # Fetch the file content
        file_content = source_file.get_bytes()
        temp_file = io.BytesIO(file_content)
        temp_file.name = source_file.file_name
        def get_buffer(file):
            file.seek(0)
            return file.read()
        def save_image(image_list):
            if image_list is not None and len(image_list) > 0:
                exist_image_list = [str(i.get('id')) for i in
                                    QuerySet(File).filter(id__in=[i.id for i in image_list]).values('id')]
                save_image_list = [image for image in image_list if not exist_image_list.__contains__(str(image.id))]
                save_image_list = list({img.id: img for img in save_image_list}.values())
                for file in save_image_list:
                    file_bytes = file.meta.pop('content')
                    file.meta['knowledge_id'] = knowledge_id
                    file.source_type = FileSourceType.KNOWLEDGE
                    file.source_id = knowledge_id
                    file.save(file_bytes)
        # Process the document with MinerU
        maxkb_logger.info(f"Using MinerU to reprocess document: {document.name}")
        paragraphs = mineru_handler.handle(
            temp_file,
            [],  # pattern_list
            False,  # with_filter
            0,  # limit (0 means unlimited)
            get_buffer,
            save_image,
            llm_model_id=llm_model_id,
            vision_model_id=vision_model_id
        )
        if paragraphs and len(paragraphs) > 0:
            # Create the new paragraphs
            paragraph_model_list = []
            for index, paragraph in enumerate(paragraphs):
                paragraph_instance = Paragraph(
                    id=uuid_lib.uuid4(),
                    document_id=document_id,
                    knowledge_id=knowledge_id,
                    content=paragraph.get('content', ''),
                    title=paragraph.get('title', ''),
                    status=get_default_status(),
                    is_active=True,
                    hit_num=0,
                    position=index + 1
                )
                if 'image_list' in paragraph:
                    paragraph_instance.image_list = paragraph['image_list']
                paragraph_model_list.append(paragraph_instance)
            # Bulk-insert the paragraphs
            QuerySet(Paragraph).bulk_create(paragraph_model_list)
            # Update the document character count
            char_length = sum([len(p.content) for p in paragraph_model_list])
            document.char_length = char_length
            document.save()
            # MinerU parsing finished; kick off the embedding task
            embedding_model_id = knowledge.embedding_model_id
            maxkb_logger.info(f"Starting embedding for document {document_id} after MinerU parsing")
            # embedding_by_document will move the status from PARSING to STARTED on its own
            embedding_by_document.delay(
                str(document_id),
                str(embedding_model_id)
            )
            maxkb_logger.info(f"MinerU reprocessing completed for document: {document.name}, "
                              f"created {len(paragraph_model_list)} paragraphs")
        else:
            maxkb_logger.warning(f"MinerU returned no paragraphs for document: {document.name}")
            # Mark the document as failed
            ListenerManagement.update_status(
                QuerySet(Document).filter(id=document_id),
                TaskType.EMBEDDING,
                State.FAILURE
            )
    except Exception as e:
        maxkb_logger.error(f"Failed to process document {document_id}: {str(e)}", exc_info=True)
        # Mark the document as failed
        ListenerManagement.update_status(
            QuerySet(Document).filter(id=document_id),
            TaskType.EMBEDDING,
            State.FAILURE
        )
@celery_app.task(name='celery:batch_advanced_learning')
def batch_advanced_learning(document_id_list: List[str], knowledge_id: str, workspace_id: str,
                            llm_model_id: str, vision_model_id: str):
    """
    Batch advanced-learning task
    @param document_id_list: list of document IDs
    @param knowledge_id: knowledge base ID
    @param workspace_id: workspace ID
    @param llm_model_id: large language model ID
    @param vision_model_id: vision model ID
    """
    from common.utils.logger import maxkb_logger
    maxkb_logger.info(f"batch_advanced_learning called with {len(document_id_list)} documents")
    for document_id in document_id_list:
        maxkb_logger.info(f"Submitting advanced_learning_by_document for document {document_id}")
        advanced_learning_by_document.apply_async(
            args=[str(document_id), str(knowledge_id), workspace_id, llm_model_id, vision_model_id],
            queue='celery'
        )
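A sketch of the resulting task chain (placeholder IDs; queue name as used above):

batch_advanced_learning.delay(
    ['document-uuid-1', 'document-uuid-2'],
    'knowledge-uuid',
    'default',            # workspace_id
    'llm-model-uuid',
    'vision-model-uuid',
)
# Each document then runs advanced_learning_by_document on the 'celery' queue:
# PARSING -> MinerU re-parse -> paragraphs rebuilt -> embedding_by_document -> STARTED -> SUCCESS/FAILURE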

View File

@@ -12,7 +12,7 @@ from common.event import ListenerManagement
 from common.utils.logger import maxkb_logger
 from common.utils.page_utils import page, page_desc
 from knowledge.models import Paragraph, Document, Status, TaskType, State
-from knowledge.task.handler import save_problem
+from knowledge.tasks.handler import save_problem
 from models_provider.models import Model
 from models_provider.tools import get_model
 from ops import celery_app

View File

@@ -23,7 +23,7 @@ from ops import celery_app
 @celery_app.task(base=QueueOnce, once={'keys': ['knowledge_id']}, name='celery:sync_web_knowledge')
 def sync_web_knowledge(knowledge_id: str, url: str, selector: str):
-    from knowledge.task.handler import get_save_handler
+    from knowledge.tasks.handler import get_save_handler
     try:
         maxkb_logger.info(
@@ -40,7 +40,7 @@ def sync_web_knowledge(knowledge_id: str, url: str, selector: str):
 @celery_app.task(base=QueueOnce, once={'keys': ['knowledge_id']}, name='celery:sync_replace_web_knowledge')
 def sync_replace_web_knowledge(knowledge_id: str, url: str, selector: str):
-    from knowledge.task.handler import get_sync_handler
+    from knowledge.tasks.handler import get_sync_handler
     try:
         maxkb_logger.info(
@@ -56,7 +56,7 @@ def sync_replace_web_knowledge(knowledge_id: str, url: str, selector: str):
 @celery_app.task(name='celery:sync_web_document')
 def sync_web_document(knowledge_id, source_url_list: List[str], selector: str):
-    from knowledge.task.handler import get_sync_web_document_handler
+    from knowledge.tasks.handler import get_sync_web_document_handler
     handler = get_sync_web_document_handler(knowledge_id)
     for source_url in source_url_list:

View File

@@ -26,6 +26,7 @@ urlpatterns = [
     path('workspace/<str:workspace_id>/knowledge/<str:knowledge_id>/document/batch_sync', views.DocumentView.BatchSync.as_view()),
     path('workspace/<str:workspace_id>/knowledge/<str:knowledge_id>/document/batch_delete', views.DocumentView.BatchDelete.as_view()),
     path('workspace/<str:workspace_id>/knowledge/<str:knowledge_id>/document/batch_refresh', views.DocumentView.BatchRefresh.as_view()),
+    path('workspace/<str:workspace_id>/knowledge/<str:knowledge_id>/document/batch_advanced_learning', views.BatchAdvancedLearning.as_view()),
     path('workspace/<str:workspace_id>/knowledge/<str:knowledge_id>/document/batch_generate_related', views.DocumentView.BatchGenerateRelated.as_view()),
     path('workspace/<str:workspace_id>/knowledge/<str:knowledge_id>/document/web', views.WebDocumentView.as_view()),
     path('workspace/<str:workspace_id>/knowledge/<str:knowledge_id>/document/qa', views.QaDocumentView.as_view()),

View File

@@ -13,7 +13,7 @@ from knowledge.api.document import DocumentSplitAPI, DocumentBatchAPI, DocumentB
     DocumentReadAPI, DocumentEditAPI, DocumentDeleteAPI, TableDocumentCreateAPI, QaDocumentCreateAPI, \
     WebDocumentCreateAPI, CancelTaskAPI, BatchCancelTaskAPI, SyncWebAPI, RefreshAPI, BatchEditHitHandlingAPI, \
     DocumentTreeReadAPI, DocumentSplitPatternAPI, BatchRefreshAPI, BatchGenerateRelatedAPI, TemplateExportAPI, \
-    DocumentExportAPI, DocumentMigrateAPI, DocumentDownloadSourceAPI
+    DocumentExportAPI, DocumentMigrateAPI, DocumentDownloadSourceAPI, BatchAdvancedLearningAPI
 from knowledge.serializers.common import get_knowledge_operation_object
 from knowledge.serializers.document import DocumentSerializers
 from knowledge.views.common import get_knowledge_document_operation_object, get_document_operation_object_batch, \
@@ -817,3 +817,39 @@ class TableTemplate(APIView):
                     tags=[_('Knowledge Base/Documentation')])  # type: ignore
     def get(self, request: Request):
         return DocumentSerializers.Export(data={'type': request.query_params.get('type')}).table_export(with_valid=True)
+class BatchAdvancedLearning(APIView):
+    authentication_classes = [TokenAuth]
+    @extend_schema(
+        methods=['PUT'],
+        summary=_('Batch advanced learning with MinerU'),
+        operation_id=_('Batch advanced learning with MinerU'),  # type: ignore
+        request=BatchAdvancedLearningAPI.get_request(),
+        parameters=BatchAdvancedLearningAPI.get_parameters(),
+        responses=BatchAdvancedLearningAPI.get_response(),
+        tags=[_('Knowledge Base/Documentation')]  # type: ignore
+    )
+    @has_permissions(
+        PermissionConstants.KNOWLEDGE_DOCUMENT_VECTOR.get_workspace_knowledge_permission(),
+        PermissionConstants.KNOWLEDGE_DOCUMENT_VECTOR.get_workspace_permission_workspace_manage_role(),
+        PermissionConstants.KNOWLEDGE_DOCUMENT_EDIT.get_workspace_knowledge_permission(),
+        PermissionConstants.KNOWLEDGE_DOCUMENT_EDIT.get_workspace_permission_workspace_manage_role(),
+        RoleConstants.WORKSPACE_MANAGE.get_workspace_role(),
+        ViewPermission([RoleConstants.USER.get_workspace_role()],
+                       [PermissionConstants.KNOWLEDGE.get_workspace_knowledge_permission()], CompareConstants.AND),
+    )
+    @log(
+        menu='document', operate="Batch advanced learning",
+        get_operation_object=lambda r, keywords: get_knowledge_document_operation_object(
+            get_knowledge_operation_object(keywords.get('knowledge_id')),
+            get_document_operation_object_batch(r.data.get('id_list')),
+        ),
+    )
+    def put(self, request: Request, workspace_id: str, knowledge_id: str):
+        return result.success(
+            DocumentSerializers.BatchAdvancedLearning(
+                data={'workspace_id': workspace_id, 'knowledge_id': knowledge_id}
+            ).batch_advanced_learning(request.data)
+        )
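For reference, a hedged sketch of exercising the new endpoint over HTTP (host, route prefix, token and IDs are all placeholders; the body shape matches DocumentBatchAdvancedLearningSerializer):

import requests  # assumed available; not part of this commit

resp = requests.put(
    'http://localhost:8080/api/workspace/default/knowledge/knowledge-uuid/document/batch_advanced_learning',
    headers={'Authorization': 'Bearer <token>'},
    json={
        'id_list': ['document-uuid'],
        'llm_model': 'llm-model-uuid',
        'vision_model': 'vision-model-uuid',
    },
)
print(resp.json())  # expected: a success wrapper around True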

View File

@@ -0,0 +1,79 @@
# coding=utf-8
"""
@project: MaxKB
@Author: MaxKB Team
@file: image.py
@date: 2024/12/20
@desc: Kimi Image/Vision Model Credential
"""
import traceback
from typing import Dict
from django.utils.translation import gettext_lazy as _, gettext
from langchain_core.messages import HumanMessage
from common import forms
from common.exception.app_exception import AppApiException
from common.forms import BaseForm, TooltipLabel
from models_provider.base_model_provider import BaseModelCredential, ValidCode
class KimiImageModelParams(BaseForm):
temperature = forms.SliderField(TooltipLabel(_('Temperature'),
_('Higher values make the output more random, while lower values make it more focused and deterministic')),
required=True, default_value=0.3,
_min=0.1,
_max=1.0,
_step=0.01,
precision=2)
max_tokens = forms.SliderField(
TooltipLabel(_('Output the maximum Tokens'),
_('Specify the maximum number of tokens that the model can generate')),
required=True, default_value=1024,
_min=1,
_max=100000,
_step=1,
precision=0)
class KimiImageModelCredential(BaseForm, BaseModelCredential):
api_base = forms.TextInputField('API URL', required=True)
api_key = forms.PasswordInputField('API Key', required=True)
def is_valid(self, model_type: str, model_name, model_credential: Dict[str, object], model_params, provider,
raise_exception=False):
model_type_list = provider.get_model_type_list()
if not any(list(filter(lambda mt: mt.get('value') == model_type, model_type_list))):
raise AppApiException(ValidCode.valid_error.value,
gettext('{model_type} Model type is not supported').format(model_type=model_type))
for key in ['api_base', 'api_key']:
if key not in model_credential:
if raise_exception:
raise AppApiException(ValidCode.valid_error.value, gettext('{key} is required').format(key=key))
else:
return False
try:
model = provider.get_model(model_type, model_name, model_credential, **model_params)
res = model.stream([HumanMessage(content=[{"type": "text", "text": gettext('Hello')}])])
for chunk in res:
pass # Just consume the stream to validate
except Exception as e:
traceback.print_exc()
if isinstance(e, AppApiException):
raise e
if raise_exception:
raise AppApiException(ValidCode.valid_error.value,
gettext(
'Verification failed, please check whether the parameters are correct: {error}').format(
error=str(e)))
else:
return False
return True
def encryption_dict(self, model: Dict[str, object]):
return {**model, 'api_key': super().encryption(model.get('api_key', ''))}
def get_model_params_setting_form(self, model_name):
return KimiImageModelParams()

View File

@@ -12,11 +12,15 @@ from common.utils.common import get_file_content
 from models_provider.base_model_provider import IModelProvider, ModelProvideInfo, ModelInfo, \
     ModelTypeConst, ModelInfoManage
 from models_provider.impl.kimi_model_provider.credential.llm import KimiLLMModelCredential
+from models_provider.impl.kimi_model_provider.credential.image import KimiImageModelCredential
 from models_provider.impl.kimi_model_provider.model.llm import KimiChatModel
+from models_provider.impl.kimi_model_provider.model.image import KimiImageModel
 from maxkb.conf import PROJECT_DIR
 kimi_llm_model_credential = KimiLLMModelCredential()
+kimi_image_model_credential = KimiImageModelCredential()
+# LLM Models
 moonshot_v1_8k = ModelInfo('moonshot-v1-8k', '', ModelTypeConst.LLM, kimi_llm_model_credential,
                            KimiChatModel)
 moonshot_v1_32k = ModelInfo('moonshot-v1-32k', '', ModelTypeConst.LLM, kimi_llm_model_credential,
@@ -24,8 +28,24 @@ moonshot_v1_32k = ModelInfo('moonshot-v1-32k', '', ModelTypeConst.LLM, kimi_llm_
 moonshot_v1_128k = ModelInfo('moonshot-v1-128k', '', ModelTypeConst.LLM, kimi_llm_model_credential,
                              KimiChatModel)
-model_info_manage = ModelInfoManage.builder().append_model_info(moonshot_v1_8k).append_model_info(
-    moonshot_v1_32k).append_default_model_info(moonshot_v1_128k).append_default_model_info(moonshot_v1_8k).build()
+# Vision/Image Models
+moonshot_v1_8k_vision = ModelInfo('moonshot-v1-8k-vision-preview', 'Kimi 视觉模型 8K', ModelTypeConst.IMAGE,
+                                  kimi_image_model_credential, KimiImageModel)
+moonshot_v1_32k_vision = ModelInfo('moonshot-v1-32k-vision-preview', 'Kimi 视觉模型 32K', ModelTypeConst.IMAGE,
+                                   kimi_image_model_credential, KimiImageModel)
+moonshot_v1_128k_vision = ModelInfo('moonshot-v1-128k-vision-preview', 'Kimi 视觉模型 128K', ModelTypeConst.IMAGE,
+                                    kimi_image_model_credential, KimiImageModel)
+model_info_manage = (ModelInfoManage.builder()
+                     .append_model_info(moonshot_v1_8k)
+                     .append_model_info(moonshot_v1_32k)
+                     .append_default_model_info(moonshot_v1_128k)
+                     .append_default_model_info(moonshot_v1_8k)
+                     .append_model_info(moonshot_v1_8k_vision)
+                     .append_model_info(moonshot_v1_32k_vision)
+                     .append_model_info(moonshot_v1_128k_vision)
+                     .append_default_model_info(moonshot_v1_128k_vision)
+                     .build())
 class KimiModelProvider(IModelProvider):
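A small sketch (credential values are placeholders) of instantiating one of the newly registered vision models through new_instance() from model/image.py below:

model = KimiImageModel.new_instance(
    'IMAGE',
    'moonshot-v1-8k-vision-preview',
    {'api_base': 'https://api.moonshot.cn/v1', 'api_key': 'sk-placeholder'},  # placeholder credentials
    temperature=0.3,
    max_tokens=1024,
)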

View File

@@ -0,0 +1,32 @@
# coding=utf-8
"""
@project: MaxKB
@Author: MaxKB Team
@file: image.py
@date: 2024/12/20
@desc: Kimi Image/Vision Model Implementation
"""
from typing import Dict
from models_provider.base_model_provider import MaxKBBaseModel
from models_provider.impl.base_chat_open_ai import BaseChatOpenAI
class KimiImageModel(MaxKBBaseModel, BaseChatOpenAI):
@staticmethod
def is_cache_model():
return False
@staticmethod
def new_instance(model_type, model_name, model_credential: Dict[str, object], **model_kwargs):
optional_params = MaxKBBaseModel.filter_optional_params(model_kwargs)
return KimiImageModel(
model_name=model_name,
openai_api_base=model_credential.get('api_base'),
openai_api_key=model_credential.get('api_key'),
streaming=True,
stream_usage=True,
extra_body=optional_params
)

View File

@@ -7,3 +7,15 @@
 @desc:
 """
 from .celery import app as celery_app
+# Import and register advanced learning tasks
+try:
+    from knowledge.tasks.advanced_learning import (
+        advanced_learning_by_document,
+        batch_advanced_learning
+    )
+    # Register tasks with the celery app
+    celery_app.register_task(advanced_learning_by_document)
+    celery_app.register_task(batch_advanced_learning)
+except ImportError:
+    pass
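The explicit register_task calls ensure the worker sees the tasks even if nothing else imports the module; a more conventional alternative, assuming the Celery app is built in ops/celery.py, would be autodiscovery — a hypothetical sketch, not part of this commit:

# Hypothetical alternative: rely on Celery autodiscovery.
from celery import Celery

app = Celery('maxkb')
app.autodiscover_tasks(['knowledge'], related_name='tasks')  # imports the knowledge.tasks package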

dev/Makefile Normal file
View File

@@ -0,0 +1,67 @@
# MaxKB development environment shortcuts
# Quickly restart the Python process (most common)
restart:
	@echo "🔄 Restarting the MaxKB Python process..."
	@docker exec maxkb-dev pkill -f "python.*main.py" 2>/dev/null || true
	@sleep 2
	@echo "✅ Restart complete"
# Tail the logs
logs:
	docker logs -f maxkb-dev
# Show the latest 20 log lines
log:
	docker logs maxkb-dev --tail 20
# Open a shell inside the container
shell:
	docker exec -it maxkb-dev bash
# List the Python processes
ps:
	@docker exec maxkb-dev ps aux | grep python | grep -v grep || echo "No Python process found"
# Fully restart the container
full-restart:
	docker compose -f docker-compose-simple.yml restart
# Stop the container
stop:
	docker compose -f docker-compose-simple.yml stop
# Start the container
start:
	docker compose -f docker-compose-simple.yml start
# Rebuild and start
rebuild:
	docker compose -f docker-compose-simple.yml down
	docker compose -f docker-compose-simple.yml up -d --build
# Show container status
status:
	@docker ps --filter name=maxkb-dev --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}"
# Truncate log files
clean-logs:
	@docker exec maxkb-dev bash -c "find /opt/maxkb-app -name '*.log' -type f -exec truncate -s 0 {} \;" 2>/dev/null || true
	@echo "Logs cleaned"
# Help
help:
	@echo "MaxKB development environment shortcuts:"
	@echo "  make restart      - quickly restart the Python process (fastest)"
	@echo "  make logs         - tail the logs"
	@echo "  make log          - show the latest 20 log lines"
	@echo "  make shell        - open a shell inside the container"
	@echo "  make ps           - list the Python processes"
	@echo "  make full-restart - fully restart the container"
	@echo "  make stop         - stop the container"
	@echo "  make start        - start the container"
	@echo "  make rebuild      - rebuild and start"
	@echo "  make status       - show container status"
	@echo "  make clean-logs   - truncate log files"
.DEFAULT_GOAL := help

dev/sync_mineru_dirs.py Executable file
View File

@@ -0,0 +1,281 @@
#!/usr/bin/env python3
"""
Two-way sync script: keeps two mineru directories in sync
Directory 1: ~/Documents/felo/gptbase-parser/loader/mineru
Directory 2: /Users/moshui/Documents/felo/moshui/MaxKB/apps/common/handle/impl/mineru
"""
import os
import sys
import time
import shutil
import hashlib
import argparse
from pathlib import Path
from datetime import datetime
from typing import Set, Tuple, Optional
import subprocess
try:
    from watchdog.observers import Observer
    from watchdog.events import FileSystemEventHandler
    WATCHDOG_AVAILABLE = True
except ImportError:
    WATCHDOG_AVAILABLE = False
    print("Warning: watchdog is not installed; live monitoring is unavailable")
    print("Run 'pip install watchdog' to enable live monitoring")
class DirectorySyncer:
    def __init__(self, dir1: str, dir2: str, verbose: bool = False):
        self.dir1 = Path(dir1).expanduser().resolve()
        self.dir2 = Path(dir2).expanduser().resolve()
        self.verbose = verbose
        self.exclude_patterns = {
            '__pycache__',
            '.DS_Store',
            '*.pyc',
            '*.pyo',
            '.git',
            '.idea',
            '.vscode',
            '*.swp',
            '*.swo',
            '*~'
        }
    def should_exclude(self, path: Path) -> bool:
        """Check whether a file or directory should be excluded"""
        name = path.name
        for pattern in self.exclude_patterns:
            if pattern.startswith('*'):
                if name.endswith(pattern[1:]):
                    return True
            elif pattern.endswith('*'):
                if name.startswith(pattern[:-1]):
                    return True
            elif name == pattern:
                return True
        return False
    def get_file_hash(self, filepath: Path) -> str:
        """Compute the MD5 hash of a file"""
        hash_md5 = hashlib.md5()
        try:
            with open(filepath, "rb") as f:
                for chunk in iter(lambda: f.read(4096), b""):
                    hash_md5.update(chunk)
            return hash_md5.hexdigest()
        except Exception as e:
            if self.verbose:
                print(f"Could not hash {filepath}: {e}")
            return ""
    def get_relative_files(self, directory: Path) -> Set[Path]:
        """Collect the relative paths of all files in a directory"""
        files = set()
        for item in directory.rglob("*"):
            if self.should_exclude(item):
                continue
            if item.is_file():
                relative_path = item.relative_to(directory)
                files.add(relative_path)
        return files
    def sync_file(self, source: Path, dest: Path, relative_path: Path) -> bool:
        """Sync a single file"""
        source_file = source / relative_path
        dest_file = dest / relative_path
        try:
            dest_file.parent.mkdir(parents=True, exist_ok=True)
            if not dest_file.exists():
                shutil.copy2(source_file, dest_file)
                if self.verbose:
                    print(f"Copied: {relative_path}")
                return True
            else:
                source_hash = self.get_file_hash(source_file)
                dest_hash = self.get_file_hash(dest_file)
                if source_hash != dest_hash:
                    source_mtime = source_file.stat().st_mtime
                    dest_mtime = dest_file.stat().st_mtime
                    if source_mtime > dest_mtime:
                        shutil.copy2(source_file, dest_file)
                        if self.verbose:
                            print(f"Updated: {relative_path} (source is newer)")
                        return True
                    elif self.verbose:
                        print(f"Skipped: {relative_path} (destination is newer or identical)")
        except Exception as e:
            print(f"Error syncing {relative_path}: {e}")
            return False
        return False
    def sync_directories(self) -> Tuple[int, int]:
        """Run a two-way sync"""
        print(f"\nStarting sync...")
        print(f"Directory 1: {self.dir1}")
        print(f"Directory 2: {self.dir2}")
        print("-" * 60)
        files1 = self.get_relative_files(self.dir1)
        files2 = self.get_relative_files(self.dir2)
        all_files = files1 | files2
        synced_count = 0
        deleted_count = 0
        for rel_path in all_files:
            file1 = self.dir1 / rel_path
            file2 = self.dir2 / rel_path
            if file1.exists() and not file2.exists():
                if self.sync_file(self.dir1, self.dir2, rel_path):
                    synced_count += 1
                    print(f"{rel_path}")
            elif file2.exists() and not file1.exists():
                if self.sync_file(self.dir2, self.dir1, rel_path):
                    synced_count += 1
                    print(f"{rel_path}")
            elif file1.exists() and file2.exists():
                hash1 = self.get_file_hash(file1)
                hash2 = self.get_file_hash(file2)
                if hash1 != hash2:
                    mtime1 = file1.stat().st_mtime
                    mtime2 = file2.stat().st_mtime
                    if mtime1 > mtime2:
                        if self.sync_file(self.dir1, self.dir2, rel_path):
                            synced_count += 1
                            print(f"{rel_path} (updated)")
                    else:
                        if self.sync_file(self.dir2, self.dir1, rel_path):
                            synced_count += 1
                            print(f"{rel_path} (updated)")
        return synced_count, deleted_count
    def watch(self):
        """Start file monitoring"""
        if not WATCHDOG_AVAILABLE:
            print("Error: the watchdog module is not installed")
            print("Please run: pip install watchdog")
            return
        class SyncHandler(FileSystemEventHandler):
            def __init__(self, syncer):
                self.syncer = syncer
                self.last_sync = 0
                self.sync_delay = 1
            def on_any_event(self, event):
                if event.is_directory:
                    return
                current_time = time.time()
                if current_time - self.last_sync > self.sync_delay:
                    path = Path(event.src_path)
                    if not self.syncer.should_exclude(path):
                        print(f"\nChange detected: {path.name}")
                        self.syncer.sync_directories()
                        self.last_sync = current_time
        event_handler = SyncHandler(self)
        observer = Observer()
        observer.schedule(event_handler, str(self.dir1), recursive=True)
        observer.schedule(event_handler, str(self.dir2), recursive=True)
        observer.start()
        print(f"\nWatch mode started...")
        print(f"Monitoring both directories for changes; press Ctrl+C to quit")
        print("-" * 60)
        try:
            while True:
                time.sleep(1)
        except KeyboardInterrupt:
            observer.stop()
            print("\nMonitoring stopped")
        observer.join()
def main():
    parser = argparse.ArgumentParser(
        description="Two-way sync between two mineru directories",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  %(prog)s            # run a single sync
  %(prog)s --watch    # start live watch mode
  %(prog)s --verbose  # show detailed output
  %(prog)s --dry-run  # simulate without actually syncing
"""
    )
    parser.add_argument(
        "--watch", "-w",
        action="store_true",
        help="start watch mode and sync file changes live"
    )
    parser.add_argument(
        "--verbose", "-v",
        action="store_true",
        help="show detailed output"
    )
    parser.add_argument(
        "--dry-run", "-n",
        action="store_true",
        help="simulate and only show the operations that would run"
    )
    parser.add_argument(
        "--dir1",
        default="~/Documents/felo/gptbase-parser/loader/mineru",
        help="path of the first directory (default: ~/Documents/felo/gptbase-parser/loader/mineru)"
    )
    parser.add_argument(
        "--dir2",
        default="/Users/moshui/Documents/felo/moshui/MaxKB/apps/common/handle/impl/mineru",
        help="path of the second directory"
    )
    args = parser.parse_args()
    syncer = DirectorySyncer(args.dir1, args.dir2, verbose=args.verbose)
    if not syncer.dir1.exists():
        print(f"Error: directory does not exist - {syncer.dir1}")
        sys.exit(1)
    if not syncer.dir2.exists():
        print(f"Error: directory does not exist - {syncer.dir2}")
        sys.exit(1)
    if args.dry_run:
        print("Dry-run mode - no files will actually be modified")
        syncer.verbose = True
    if args.watch:
        syncer.sync_directories()
        syncer.watch()
    else:
        synced, deleted = syncer.sync_directories()
        print("-" * 60)
        print(f"Sync finished: {synced} file(s) synced")
        if deleted > 0:
            print(f"             {deleted} file(s) deleted")
if __name__ == "__main__":
    main()

View File

@@ -361,6 +361,35 @@ const putBatchRefresh: (
   )
 }
+/**
+ *
+ * @param knowledge_id,
+ * {
+ *   "id_list": [
+ *     "3fa85f64-5717-4562-b3fc-2c963f66afa6"
+ *   ],
+ *   "llm_model": "model_id",
+ *   "vision_model": "model_id"
+ * }
+ */
+const putBatchAdvancedLearning: (
+  knowledge_id: string,
+  idList: string[],
+  models: { llmModel: string; visionModel: string },
+  loading?: Ref<boolean>,
+) => Promise<Result<boolean>> = (knowledge_id, idList, models, loading) => {
+  return put(
+    `${prefix.value}/${knowledge_id}/document/batch_advanced_learning`,
+    {
+      id_list: idList,
+      llm_model: models.llmModel,
+      vision_model: models.visionModel,
+    },
+    undefined,
+    loading,
+  )
+}
 /**
  *
  * @param knowledge_id,
@@ -582,6 +611,7 @@ export default {
   putBatchGenerateRelated,
   putBatchEditHitHandling,
   putBatchRefresh,
+  putBatchAdvancedLearning,
   putMulSyncDocument,
   putMigrateMulDocument,
   postQADocument,

View File

@@ -111,4 +111,5 @@ export default {
   copyTitle: '副本',
   professional: '购买专业版',
   sync: '同步',
+  shared: '共享',
 }

View File

@@ -42,7 +42,7 @@ export default {
   fileType: {
     txt: {
-      label: '文本文件',
+      label: '普通学习',
       tip1: '1、文件上传前建议规范文件的分段标识',
     },
     table: {
@@ -97,6 +97,7 @@ export default {
     name: '文件名称',
     char_length: '字符数',
     paragraph: '分段',
+    learningType: '学习方式',
     all: '全部',
     updateTime: '更新时间',
   },
@@ -109,6 +110,7 @@ export default {
     GENERATE: '生成中',
     SYNC: '同步中',
     REVOKE: '取消中',
+    PARSING: '学习中',
     finish: '完成',
   },
   enableStatus: {
@@ -181,4 +183,23 @@ export default {
     allCheck: '全选',
     errorMessage1: '请选择文档',
   },
+  advancedLearning: {
+    title: '高级学习',
+    button: '高级学习',
+    llmModel: '大语言模型',
+    visionModel: '视觉模型',
+    selectLlmModel: '请选择大语言模型',
+    selectVisionModel: '请选择视觉模型',
+    llmModelRequired: '请选择大语言模型',
+    visionModelRequired: '请选择视觉模型',
+    tip1: '高级学习提供高质量的 PDF 和 PPT 文档解析,支持复杂表格、图片、公式等内容',
+    tip2: '原有段落和索引将被删除,文档将使用高级学习重新解析',
+    tip3: '解析过程可能需要较长时间,请耐心等待',
+    successMessage: '高级学习任务已启动',
+    loadModelsFailed: '加载模型列表失败',
+  },
+  learningType: {
+    regular: '普通学习',
+    advanced: '高级学习',
+  },
 }

View File

@@ -20,6 +20,8 @@ interface StateInterface {
   REVOKE: '4'
   // Revoke succeeded
   REVOKED: '5'
+  // Parsing
+  PARSING: '6'
   IGNORED: 'n'
 }
 const TaskType: TaskTypeInterface = {
@@ -40,6 +42,8 @@ const State: StateInterface = {
   REVOKE: '4',
   // Revoke succeeded
   REVOKED: '5',
+  // Parsing
+  PARSING: '6',
   IGNORED: 'n'
 }
 class Status {

View File

@@ -0,0 +1,185 @@
<template>
<el-dialog
v-model="dialogVisible"
:title="$t('views.document.advancedLearning.title')"
:before-close="close"
width="630px"
>
<el-form :model="form" :rules="rules" ref="formRef" label-position="top" class="form-container">
<el-form-item :label="$t('views.document.advancedLearning.llmModel')" prop="llmModel">
<el-select
v-model="form.llmModel"
:placeholder="$t('views.document.advancedLearning.selectLlmModel')"
class="w-full"
clearable
>
<el-option
v-for="model in llmModels"
:key="model.id"
:label="model.name"
:value="model.id"
>
<span>{{ model.name }}</span>
<el-tag v-if="model.type === 'share'" size="small" class="ml-8">{{
$t('common.shared')
}}</el-tag>
</el-option>
</el-select>
</el-form-item>
<el-form-item :label="$t('views.document.advancedLearning.visionModel')" prop="visionModel">
<el-select
v-model="form.visionModel"
:placeholder="$t('views.document.advancedLearning.selectVisionModel')"
class="w-full"
clearable
>
<el-option
v-for="model in visionModels"
:key="model.id"
:label="model.name"
:value="model.id"
>
<span>{{ model.name }}</span>
<el-tag v-if="model.type === 'share'" size="small" class="ml-8">{{
$t('common.shared')
}}</el-tag>
</el-option>
</el-select>
</el-form-item>
<div class="update-info flex border-r-6 mb-16" style="padding: 8px 12px;">
<div class="mt-4">
<AppIcon iconName="app-warning-colorful" style="font-size: 16px"></AppIcon>
</div>
<div class="ml-16 lighter" style="padding-right: 12px;">
<p>{{ $t('views.document.advancedLearning.tip1') }}</p>
<p>{{ $t('views.document.advancedLearning.tip2') }}</p>
<p>{{ $t('views.document.advancedLearning.tip3') }}</p>
</div>
</div>
</el-form>
<template #footer>
<div class="dialog-footer">
<el-button @click="close">{{ $t('common.cancel') }}</el-button>
<el-button type="primary" @click="submit" :loading="loading">
{{ $t('common.submit') }}
</el-button>
</div>
</template>
</el-dialog>
</template>
<script setup lang="ts">
import { ref, reactive } from 'vue'
import modelApi from '@/api/model/model'
import type { FormInstance, FormRules } from 'element-plus'
import { MsgError } from '@/utils/message'
import { useI18n } from 'vue-i18n'
const { t } = useI18n()
const dialogVisible = ref<boolean>(false)
const loading = ref<boolean>(false)
const formRef = ref<FormInstance>()
const form = reactive({
llmModel: null as string | null,
visionModel: null as string | null,
})
const rules = reactive<FormRules>({
llmModel: [
{
required: true,
message: t('views.document.advancedLearning.llmModelRequired'),
trigger: 'change',
},
],
visionModel: [
{
required: true,
message: t('views.document.advancedLearning.visionModelRequired'),
trigger: 'change',
},
],
})
const llmModels = ref<any[]>([])
const visionModels = ref<any[]>([])
const submit_handle = ref<(models: { llmModel: string; visionModel: string }) => void>()
const loadModels = async () => {
try {
const response = await modelApi.getSelectModelList()
if (response.data) {
llmModels.value = response.data.filter((m: any) => m.model_type === 'LLM')
visionModels.value = response.data.filter(
(m: any) => m.model_type === 'IMAGE',
)
}
} catch (error) {
console.error('Failed to load models:', error)
MsgError(t('views.document.advancedLearning.loadModelsFailed'))
}
}
const submit = async () => {
if (!formRef.value) return
await formRef.value.validate((valid) => {
if (valid) {
if (submit_handle.value && form.llmModel && form.visionModel) {
loading.value = true
submit_handle.value({
llmModel: form.llmModel,
visionModel: form.visionModel,
})
}
}
})
}
const open = (handle: (models: { llmModel: string; visionModel: string }) => void) => {
submit_handle.value = handle
loadModels()
dialogVisible.value = true
}
const close = () => {
loading.value = false
form.llmModel = null
form.visionModel = null
formRef.value?.resetFields()
submit_handle.value = undefined
dialogVisible.value = false
}
defineExpose({ open, close })
</script>
<style lang="scss" scoped>
.update-info {
background: #fef0e6;
color: #8a6d3b;
font-size: 14px;
}
.form-container {
width: 100%;
:deep(.el-form-item) {
margin-bottom: 22px;
}
:deep(.el-form-item__content) {
width: 100%;
}
:deep(.el-select) {
width: 100% !important;
}
}
.w-full {
width: 100% !important;
}
</style>
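The dialog exposes an imperative API through defineExpose instead of a v-model flag: the parent hands open() a callback and keeps full control of the request lifecycle, since the spinner keeps running until the parent calls close(). A usage sketch — the real consumer is the document list page further down in this commit; the handler body here is illustrative:

// Illustrative parent wiring (the actual call site appears below in
// this commit's document list page).
import { ref } from 'vue'
import AdvancedLearningDialog from '@/views/document/component/AdvancedLearningDialog.vue'

const dialogRef = ref<InstanceType<typeof AdvancedLearningDialog>>()

const startAdvancedLearning = () => {
  dialogRef.value?.open(({ llmModel, visionModel }) => {
    // fire the batch request here, then close once it settles
    dialogRef.value?.close()
  })
}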

View File

@@ -35,6 +35,10 @@
<el-icon class="is-loading color-primary"><Loading /></el-icon> <el-icon class="is-loading color-primary"><Loading /></el-icon>
{{ stateMap[aggStatus.value](aggStatus.key) }} {{ stateMap[aggStatus.value](aggStatus.key) }}
</el-text> </el-text>
<el-text v-else-if="aggStatus?.value === State.PARSING">
<el-icon class="is-loading color-primary"><Loading /></el-icon>
{{ stateMap[aggStatus.value](aggStatus.key) }}
</el-text>
</template> </template>
</el-popover> </el-popover>
</template> </template>
@@ -49,6 +53,7 @@ const checkList: Array<string> = [
State.REVOKE, State.REVOKE,
State.STARTED, State.STARTED,
State.PENDING, State.PENDING,
State.PARSING,
State.FAILURE, State.FAILURE,
State.REVOKED, State.REVOKED,
State.SUCCESS State.SUCCESS
@@ -82,6 +87,7 @@ const stateMap: any = {
[State.REVOKED]: (type: number) => t('views.document.fileStatus.SUCCESS'), [State.REVOKED]: (type: number) => t('views.document.fileStatus.SUCCESS'),
[State.FAILURE]: (type: number) => t('views.document.fileStatus.FAILURE'), [State.FAILURE]: (type: number) => t('views.document.fileStatus.FAILURE'),
[State.SUCCESS]: (type: number) => t('views.document.fileStatus.SUCCESS'), [State.SUCCESS]: (type: number) => t('views.document.fileStatus.SUCCESS'),
[State.PARSING]: (type: number) => t('views.document.fileStatus.PARSING'),
} }
</script> </script>
<style lang="scss" scoped></style> <style lang="scss" scoped></style>
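checkList appears to double as a priority order: when the aggregated documents report mixed states, the first entry that matches wins, which would explain why PARSING is slotted between PENDING and FAILURE rather than appended at the end. A condensed sketch of that selection, assuming aggStatus entries carry value (state) and key (task type):

// Condensed sketch; treating checkList order as the display priority is
// an assumption based on this component's template, not its full source.
const pickAggStatus = (items: Array<{ key: number; value: string }>) =>
  checkList
    .map((state) => items.find((item) => item.value === state))
    .find((hit) => hit !== undefined)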

View File

@@ -44,6 +44,12 @@
v-if="permissionPrecise.doc_vector(id)" v-if="permissionPrecise.doc_vector(id)"
>{{ $t('views.knowledge.setting.vectorization') }} >{{ $t('views.knowledge.setting.vectorization') }}
</el-button> </el-button>
<el-button
@click="openAdvancedLearningDialog"
:disabled="multipleSelection.length === 0"
v-if="permissionPrecise.doc_vector(id)"
>{{ $t('views.document.advancedLearning.button') }}
</el-button>
<el-button <el-button
@click="openGenerateDialog()" @click="openGenerateDialog()"
:disabled="multipleSelection.length === 0" :disabled="multipleSelection.length === 0"
@@ -239,6 +245,19 @@
sortable sortable
/> />
<el-table-column
prop="learning_type"
:label="$t('views.document.table.learningType')"
align="center"
min-width="100"
>
<template #default="{ row }">
<el-tag :type="row.learning_type === 'advanced' ? 'danger' : 'success'" size="small">
{{ $t(`views.document.learningType.${row.learning_type || 'regular'}`) }}
</el-tag>
</template>
</el-table-column>
<el-table-column width="130"> <el-table-column width="130">
<template #header> <template #header>
<div> <div>
@@ -635,6 +654,7 @@
</div> </div>
<EmbeddingContentDialog ref="embeddingContentDialogRef"></EmbeddingContentDialog> <EmbeddingContentDialog ref="embeddingContentDialogRef"></EmbeddingContentDialog>
<AdvancedLearningDialog ref="advancedLearningDialogRef"></AdvancedLearningDialog>
<ImportDocumentDialog ref="ImportDocumentDialogRef" :title="title" @refresh="refresh" /> <ImportDocumentDialog ref="ImportDocumentDialogRef" :title="title" @refresh="refresh" />
<SyncWebDialog ref="SyncWebDialogRef" @refresh="refresh" /> <SyncWebDialog ref="SyncWebDialogRef" @refresh="refresh" />
@@ -662,6 +682,7 @@ import useStore from '@/stores'
import StatusValue from '@/views/document/component/Status.vue' import StatusValue from '@/views/document/component/Status.vue'
import GenerateRelatedDialog from '@/components/generate-related-dialog/index.vue' import GenerateRelatedDialog from '@/components/generate-related-dialog/index.vue'
import EmbeddingContentDialog from '@/views/document/component/EmbeddingContentDialog.vue' import EmbeddingContentDialog from '@/views/document/component/EmbeddingContentDialog.vue'
import AdvancedLearningDialog from '@/views/document/component/AdvancedLearningDialog.vue'
import { TaskType, State } from '@/utils/status' import { TaskType, State } from '@/utils/status'
import { t } from '@/locales' import { t } from '@/locales'
import permissionMap from '@/permission' import permissionMap from '@/permission'
@@ -746,6 +767,7 @@ const getTaskState = (status: string, taskType: number) => {
const beforePagination = computed(() => common.paginationConfig[storeKey]) const beforePagination = computed(() => common.paginationConfig[storeKey])
const beforeSearch = computed(() => common.search[storeKey]) const beforeSearch = computed(() => common.search[storeKey])
const embeddingContentDialogRef = ref<InstanceType<typeof EmbeddingContentDialog>>() const embeddingContentDialogRef = ref<InstanceType<typeof EmbeddingContentDialog>>()
const advancedLearningDialogRef = ref<InstanceType<typeof AdvancedLearningDialog>>()
const SyncWebDialogRef = ref() const SyncWebDialogRef = ref()
const loading = ref(false) const loading = ref(false)
let interval: any let interval: any
@@ -1041,6 +1063,24 @@ function batchRefresh() {
embeddingContentDialogRef.value?.open(embeddingBatchDocument) embeddingContentDialogRef.value?.open(embeddingBatchDocument)
} }
function openAdvancedLearningDialog() {
const arr: string[] = multipleSelection.value.map((v) => v.id)
const advancedLearningDocument = (models: { llmModel: string; visionModel: string }) => {
loadSharedApi({ type: 'document', systemType: apiType.value })
.putBatchAdvancedLearning(id, arr, models, loading)
.then(() => {
MsgSuccess(t('views.document.advancedLearning.successMessage'))
multipleTableRef.value?.clearSelection()
advancedLearningDialogRef.value?.close()
getList()
})
.catch(() => {
advancedLearningDialogRef.value?.close()
})
}
advancedLearningDialogRef.value?.open(advancedLearningDocument)
}
function downloadDocument(row: any) { function downloadDocument(row: any) {
loadSharedApi({ type: 'document', systemType: apiType.value }) loadSharedApi({ type: 'document', systemType: apiType.value })
.getDownloadSourceFile(id, row.id, row.name) .getDownloadSourceFile(id, row.id, row.name)
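putBatchAdvancedLearning is resolved through loadSharedApi, and its definition is not part of this diff; the sketch below only mirrors the shape implied by the call site above, with the endpoint path, payload keys, and request wrapper all assumptions:

// Hypothetical helper shape inferred from the call site; the real
// endpoint path, payload keys, and request wrapper are not shown here.
import { type Ref } from 'vue'
import { put } from '@/request' // assumed project request wrapper

const putBatchAdvancedLearning = (
  knowledgeId: string,
  documentIdList: string[],
  models: { llmModel: string; visionModel: string },
  loading?: Ref<boolean>,
) =>
  put(
    `/knowledge/${knowledgeId}/document/batch_advanced_learning`, // assumed
    { id_list: documentIdList, ...models },
    loading,
  )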

View File

@@ -11,11 +11,11 @@
<div class="mt-16 mb-16"> <div class="mt-16 mb-16">
<el-radio-group v-model="form.fileType" @change="radioChange" class="app-radio-button-group"> <el-radio-group v-model="form.fileType" @change="radioChange" class="app-radio-button-group">
<el-radio-button value="txt">{{ $t('views.document.fileType.txt.label') }}</el-radio-button> <el-radio-button value="txt">{{ $t('views.document.fileType.txt.label') }}</el-radio-button>
<el-radio-button value="mineru">高级学习</el-radio-button>
<el-radio-button value="table">{{ <el-radio-button value="table">{{
$t('views.document.fileType.table.label') $t('views.document.fileType.table.label')
}}</el-radio-button> }}</el-radio-button>
<el-radio-button value="QA">{{ $t('views.document.fileType.QA.label') }}</el-radio-button> <el-radio-button value="QA">{{ $t('views.document.fileType.QA.label') }}</el-radio-button>
<el-radio-button value="mineru">MinerU</el-radio-button>
</el-radio-group> </el-radio-group>
</div> </div>
@@ -140,7 +140,7 @@
<AppIcon iconName="app-warning-colorful" style="font-size: 16px"></AppIcon> <AppIcon iconName="app-warning-colorful" style="font-size: 16px"></AppIcon>
</div> </div>
<div class="ml-16 lighter"> <div class="ml-16 lighter">
<p>1. MinerU 提供高质量的 PDF 和 PPT 文档解析,支持复杂表格、图片、公式等内容</p> <p>1. 高级学习提供高质量的 PDF 和 PPT 文档解析,支持复杂表格、图片、公式等内容</p>
<p>2. 支持的文件格式:PDF、PPT、PPTX</p> <p>2. 支持的文件格式:PDF、PPT、PPTX</p>
<p> <p>
3. {{ $t('views.document.tip.fileLimitCountTip1') }} {{ file_count_limit }} 3. {{ $t('views.document.tip.fileLimitCountTip1') }} {{ file_count_limit }}
@@ -361,7 +361,7 @@ const loadModels = async () => {
// filter selectable models by type // filter selectable models by type
llmModels.value = response.data.filter((m: any) => m.model_type === 'LLM') llmModels.value = response.data.filter((m: any) => m.model_type === 'LLM')
visionModels.value = response.data.filter((m: any) => visionModels.value = response.data.filter((m: any) =>
m.model_type === 'IMAGE' || m.model_type === 'LLM' // LLM also accepted m.model_type === 'IMAGE' // IMAGE only
) )
} }
} catch (error) { } catch (error) {