modify status parsing

parent f0263bf189
commit f1494fedea

@@ -26,7 +26,7 @@ from common.utils.common import post
 from knowledge.models import Paragraph, Document, Problem, ProblemParagraphMapping, Knowledge
 from knowledge.serializers.common import get_embedding_model_id_by_knowledge_id, update_document_char_length
 from knowledge.serializers.paragraph import ParagraphSerializers
-from knowledge.task.embedding import embedding_by_paragraph, embedding_by_paragraph_list
+from knowledge.tasks.embedding import embedding_by_paragraph, embedding_by_paragraph_list


 class ChatRecordSerializerModel(serializers.ModelSerializer):
@@ -275,7 +275,7 @@ class ListenerManagement:
             return
         maxkb_logger.info(_('Start--->Embedding document: {document_id}').format(document_id=document_id)
                           )
-        # Batch set the status to PADDING
+        # Batch set the status to STARTED
         ListenerManagement.update_status(QuerySet(Document).filter(id=document_id), TaskType.EMBEDDING,
                                          State.STARTED)
@@ -11,7 +11,7 @@ GPTBase platform adapter - implements GPTBase-specific functionality
 import contextlib
 from typing import Any, Dict
 from .logger import logger
-
+import os
 from .gptbase_utils import GPTBaseUtils, GZeroUtils
 from .config_gptbase import GPTBaseMinerUConfig
 from ..base_parser import PlatformAdapter, BaseMinerUExtractor, ProcessingResult
@@ -242,7 +242,7 @@ class MinerUAdapter:
             raise

     def process_document(self, file_content: bytes, file_name: str,
-                         save_image_func=None) -> Dict[str, Any]:
+                         save_image_func=None, **kwargs) -> Dict[str, Any]:
         """
         Process the document and return structured content
@@ -250,6 +250,7 @@ class MinerUAdapter:
             file_content: file content as bytes
             file_name: file name
             save_image_func: function used to save extracted images
+            **kwargs: extra parameters, including llm_model_id and vision_model_id

         Returns:
             a dict containing sections; each section contains content, title, and images
@@ -276,6 +277,14 @@ class MinerUAdapter:
         loop = asyncio.new_event_loop()
         asyncio.set_event_loop(loop)
         try:
+            # Extract the model ID parameters
+            llm_model_id = kwargs.get('llm_model_id')
+            vision_model_id = kwargs.get('vision_model_id')
+            if llm_model_id and vision_model_id:
+                logger.info(f"使用指定模型处理文档: LLM={llm_model_id}, Vision={vision_model_id}")
+                # TODO: pass the model IDs on to the extractor
+                # For now the default configuration is used; the models can be wired in here later
+
             result = loop.run_until_complete(
                 self.extractor.process_file(tmp_file_path, file_name)
             )
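
The hunk above runs the async extractor from synchronous code by spinning up a private event loop. A self-contained sketch of that pattern, with generic stand-ins rather than the actual MinerU classes:

    import asyncio

    async def extract(path: str) -> dict:
        # stand-in for self.extractor.process_file(tmp_file_path, file_name)
        await asyncio.sleep(0)
        return {'sections': [], 'source': path}

    def process_sync(path: str) -> dict:
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        try:
            return loop.run_until_complete(extract(path))
        finally:
            loop.close()

    print(process_sync('/tmp/doc.pdf'))
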
@@ -41,29 +41,43 @@ class MinerUSplitHandle(BaseSplitHandle):
         return True

     def handle(self, file, pattern_list: List, with_filter: bool, limit: int,
-               get_buffer, save_image):
+               get_buffer, save_image, **kwargs):
         """
         Process the document with MinerU
         """
         try:
             logger.info(f"MinerUSplitHandle.handle called for file: {file.name if hasattr(file, 'name') else 'unknown'}")

-            # Initialize the MinerU adapter
+            # Initialize the MinerU adapter, passing model IDs if provided
             if not self.mineru_adapter:
                 logger.info("Initializing MinerU adapter")
+                llm_model_id = kwargs.get('llm_model_id')
+                vision_model_id = kwargs.get('vision_model_id')
+                if llm_model_id and vision_model_id:
+                    logger.info(f"Using models: LLM={llm_model_id}, Vision={vision_model_id}")
                 self.mineru_adapter = MinerUAdapter()

             # Get the file content
             buffer = get_buffer(file)
             logger.info(f"File buffer size: {len(buffer) if buffer else 0} bytes")

-            # Process the document
+            # Process the document, passing the model IDs to the adapter
             logger.info("Calling MinerU adapter to process document")
-            result = self.mineru_adapter.process_document(
-                file_content=buffer,
-                file_name=file.name if hasattr(file, 'name') else 'document.pdf',
-                save_image_func=save_image
-            )
+            process_kwargs = {
+                'file_content': buffer,
+                'file_name': file.name if hasattr(file, 'name') else 'document.pdf',
+                'save_image_func': save_image
+            }
+
+            # If model IDs were supplied, pass them to the adapter
+            llm_model_id = kwargs.get('llm_model_id')
+            vision_model_id = kwargs.get('vision_model_id')
+            if llm_model_id:
+                process_kwargs['llm_model_id'] = llm_model_id
+            if vision_model_id:
+                process_kwargs['vision_model_id'] = vision_model_id
+
+            result = self.mineru_adapter.process_document(**process_kwargs)
             logger.info(f"MinerU adapter returned result with {len(result.get('sections', []))} sections")

             # Convert to paragraph format
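
A sketch of how a caller might drive the extended handle signature; MinerUSplitHandle comes from the module this hunk modifies, and the model IDs are placeholders:

    import io

    def get_buffer(file):
        file.seek(0)
        return file.read()

    def save_image(image_list):
        pass  # image persistence elided in this sketch

    handler = MinerUSplitHandle()
    doc = io.BytesIO(b'%PDF-1.4 ...')  # placeholder bytes
    doc.name = 'report.pdf'
    paragraphs = handler.handle(
        doc, [], False, 0, get_buffer, save_image,
        llm_model_id='llm-model-uuid',        # placeholder
        vision_model_id='vision-model-uuid',  # placeholder
    )
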
@@ -6,7 +6,8 @@ from common.result import DefaultResultSerializer
 from knowledge.serializers.common import BatchSerializer
 from knowledge.serializers.document import DocumentInstanceSerializer, DocumentWebInstanceSerializer, \
     CancelInstanceSerializer, BatchCancelInstanceSerializer, DocumentRefreshSerializer, BatchEditHitHandlingSerializer, \
-    DocumentBatchRefreshSerializer, DocumentBatchGenerateRelatedSerializer, DocumentMigrateSerializer
+    DocumentBatchRefreshSerializer, DocumentBatchGenerateRelatedSerializer, DocumentMigrateSerializer, \
+    DocumentBatchAdvancedLearningSerializer


 class DocumentSplitAPI(APIMixin):
@@ -473,6 +474,35 @@ class DocumentExportAPI(APIMixin):
         return DefaultResultSerializer


+class BatchAdvancedLearningAPI(APIMixin):
+    @staticmethod
+    def get_parameters():
+        return [
+            OpenApiParameter(
+                name="workspace_id",
+                description="工作空间id",
+                type=OpenApiTypes.STR,
+                location='path',
+                required=True,
+            ),
+            OpenApiParameter(
+                name="knowledge_id",
+                description="知识库id",
+                type=OpenApiTypes.STR,
+                location='path',
+                required=True,
+            ),
+        ]
+
+    @staticmethod
+    def get_request():
+        return DocumentBatchAdvancedLearningSerializer
+
+    @staticmethod
+    def get_response():
+        return DefaultResultSerializer
+
+
 class DocumentMigrateAPI(APIMixin):
     @staticmethod
     def get_parameters():
@@ -47,6 +47,8 @@ class State(Enum):
     REVOKE = '4'
     # cancel succeeded
     REVOKED = '5'
+    # parsing
+    PARSING = '6'
     # ignored
     IGNORED = 'n'
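
For orientation, a minimal runnable sketch of the task-state values this enum now carries, assuming only the members visible in this diff (STARTED and FAILURE appear in other hunks; earlier members are elided):

    from enum import Enum

    class State(Enum):
        # existing members elided; values below are as shown in the hunks
        REVOKE = '4'    # revoke requested
        REVOKED = '5'   # revoke succeeded
        PARSING = '6'   # new in this commit: document is being re-parsed
        IGNORED = 'n'   # ignored

    # Expected flow for an advanced-learning document, per the other hunks:
    # PARSING while MinerU re-parses, then STARTED once embedding_by_document
    # picks it up, ending in FAILURE on error.
    print(State.PARSING.value)  # '6'
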
@@ -45,16 +45,16 @@ from common.utils.fork import Fork
 from common.utils.logger import maxkb_logger
 from common.utils.split_model import get_split_model, flat_map
 from knowledge.models import Knowledge, Paragraph, Problem, Document, KnowledgeType, ProblemParagraphMapping, State, \
-    TaskType, File, FileSourceType
+    TaskType, File, FileSourceType, get_default_status
 from knowledge.serializers.common import ProblemParagraphManage, BatchSerializer, \
     get_embedding_model_id_by_knowledge_id, MetaSerializer, write_image, zip_dir
 from knowledge.serializers.paragraph import ParagraphSerializers, ParagraphInstanceSerializer, \
     delete_problems_and_mappings
-from knowledge.task.embedding import embedding_by_document, delete_embedding_by_document_list, \
+from knowledge.tasks.embedding import embedding_by_document, delete_embedding_by_document_list, \
     delete_embedding_by_paragraph_ids, embedding_by_document_list, \
     update_embedding_knowledge_id
-from knowledge.task.generate import generate_related_by_document_id
-from knowledge.task.sync import sync_web_document
+from knowledge.tasks.generate import generate_related_by_document_id
+from knowledge.tasks.sync import sync_web_document
 from maxkb.const import PROJECT_DIR
 from models_provider.models import Model
 from oss.serializers.file import FileSerializer
@@ -200,6 +200,12 @@ class DocumentMigrateSerializer(serializers.Serializer):
     document_id_list = serializers.ListField(required=True, label=_('document id list'))


+class DocumentBatchAdvancedLearningSerializer(serializers.Serializer):
+    id_list = serializers.ListField(required=True, label=_('document id list'))
+    llm_model = serializers.CharField(required=True, label=_('llm model id'))
+    vision_model = serializers.CharField(required=True, label=_('vision model id'))
+
+
 class BatchEditHitHandlingSerializer(serializers.Serializer):
     id_list = serializers.ListField(required=True, child=serializers.UUIDField(required=True), label=_('id list'))
     hit_handling_method = serializers.CharField(required=True, label=_('hit handling method'))
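
For reference, a payload the new serializer accepts might look like this inside a configured Django context; the IDs are placeholders:

    payload = {
        'id_list': ['3fa85f64-5717-4562-b3fc-2c963f66afa6'],  # document IDs
        'llm_model': 'llm-model-uuid',        # placeholder
        'vision_model': 'vision-model-uuid',  # placeholder
    }
    serializer = DocumentBatchAdvancedLearningSerializer(data=payload)
    assert serializer.is_valid(), serializer.errors
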
@@ -866,6 +872,11 @@ class DocumentSerializers(serializers.Serializer):
     @staticmethod
     def get_document_paragraph_model(knowledge_id, instance: Dict):
         source_meta = {'source_file_id': instance.get('source_file_id')} if instance.get('source_file_id') else {}
+        # Add the MinerU model parameters to meta
+        if instance.get('llm_model_id'):
+            source_meta['llm_model_id'] = instance.get('llm_model_id')
+        if instance.get('vision_model_id'):
+            source_meta['vision_model_id'] = instance.get('vision_model_id')
         meta = {**instance.get('meta'), **source_meta} if instance.get('meta') is not None else source_meta
         meta = convert_uuid_to_str(meta)
@@ -1156,99 +1167,10 @@ class DocumentSerializers(serializers.Serializer):
             maxkb_logger.info(f"Processing document: {document.get('name')}, llm_model_id: {llm_model_id}, vision_model_id: {vision_model_id}")

             if llm_model_id or vision_model_id:
-                maxkb_logger.info(f"Document {document.get('name')} is MinerU type, processing with MinerU handler")
-                source_file_id = document.get('source_file_id')
-                maxkb_logger.info(f"Source file ID: {source_file_id}")
-                if source_file_id:
-                    # Fetch the source file
-                    source_file = QuerySet(File).filter(id=source_file_id).first()
-                    maxkb_logger.info(f"Source file found: {source_file is not None}")
-                    if source_file:
-                        try:
-                            # Re-parse the document with the MinerU handler
-                            from common.handle.impl.text.mineru_split_handle import MinerUSplitHandle
-                            from common.utils.split_model import get_split_model
-                            import io
-
-                            # Check the MinerU configuration
-                            mineru_api_type = os.environ.get('MINERU_API_TYPE', '')
-                            if not mineru_api_type:
-                                maxkb_logger.warning(f"MinerU API not configured, skipping MinerU processing for document: {document.get('name')}")
-                                continue
-
-                            maxkb_logger.info(f"MinerU API configured: {mineru_api_type}")
-                            mineru_handler = MinerUSplitHandle()
-
-                            # Get the file content
-                            file_content = source_file.get_bytes()
-                            temp_file = io.BytesIO(file_content)
-                            temp_file.name = source_file.file_name
-
-                            # Derive the split pattern from the existing paragraphs:
-                            # the split rules the user configured during preview
-                            pattern_list = []
-                            if 'paragraphs' in document and len(document['paragraphs']) > 0:
-                                # Try to obtain the split pattern from metadata or elsewhere;
-                                # the split rules need to be passed in from the front end
-                                patterns = document.get('split_patterns', [])
-                                if patterns:
-                                    pattern_list = [get_split_model(pattern) for pattern in patterns if pattern]
-
-                            def get_buffer(file):
-                                file.seek(0)
-                                return file.read()
-
-                            def save_image(image_list):
-                                if image_list is not None and len(image_list) > 0:
-                                    exist_image_list = [str(i.get('id')) for i in
-                                                        QuerySet(File).filter(id__in=[i.id for i in image_list]).values('id')]
-                                    save_image_list = [image for image in image_list if not exist_image_list.__contains__(str(image.id))]
-                                    save_image_list = list({img.id: img for img in save_image_list}.values())
-                                    for file in save_image_list:
-                                        file_bytes = file.meta.pop('content')
-                                        file.meta['knowledge_id'] = knowledge_id
-                                        file.source_type = FileSourceType.KNOWLEDGE
-                                        file.source_id = knowledge_id
-                                        file.save(file_bytes)
-
-                            # Process with MinerU; is_preview is not passed, so MinerU will be used
-                            maxkb_logger.info(f"Using MinerU to process document: {document.get('name')}")
-                            paragraphs = mineru_handler.handle(
-                                temp_file,
-                                pattern_list,
-                                False,  # with_filter
-                                0,  # limit (0 means unlimited)
-                                get_buffer,
-                                save_image
-                            )
-
-                            # Preserve the original problem list if there was one
-                            if 'paragraphs' in document and document['paragraphs']:
-                                # Try to merge the existing problem lists into the new paragraphs
-                                old_problems = {}
-                                for old_para in document['paragraphs']:
-                                    if 'content' in old_para and 'problem_list' in old_para:
-                                        old_problems[old_para['content'][:100]] = old_para['problem_list']
-
-                                # Attach problem lists to new paragraphs with similar content
-                                for new_para in paragraphs:
-                                    content_key = new_para.get('content', '')[:100]
-                                    if content_key in old_problems:
-                                        new_para['problem_list'] = old_problems[content_key]
-
-                            # Replace the document's paragraphs with the MinerU result
-                            maxkb_logger.info(f"MinerU returned {len(paragraphs) if paragraphs else 0} paragraphs")
-                            if paragraphs and len(paragraphs) > 0:
-                                maxkb_logger.info(f"First paragraph sample: {paragraphs[0] if paragraphs else 'None'}")
-                                document['paragraphs'] = paragraphs
-                            else:
-                                maxkb_logger.warning(f"MinerU returned empty paragraphs, keeping original paragraphs")
-                            maxkb_logger.info(f"MinerU processing completed for document: {document.get('name')}, paragraphs count: {len(document.get('paragraphs', []))}")
-
-                        except Exception as e:
-                            # If MinerU processing fails, keep the original paragraphs
-                            maxkb_logger.error(f"MinerU processing failed for document {document.get('name')}: {str(e)}", exc_info=True)
-                            # Leave the original paragraphs unchanged
+                maxkb_logger.info(f"Document {document.get('name')} is MinerU type, will process asynchronously")
+                # For MinerU-type documents, save the basic information but do not process paragraphs;
+                # paragraph processing happens in an async task
+                document['paragraphs'] = []  # clear the paragraphs and wait for async processing

         # Insert documents
         for document in instance_list:
@@ -1270,7 +1192,44 @@ class DocumentSerializers(serializers.Serializer):
             )
         # Insert documents
         QuerySet(Document).bulk_create(document_model_list) if len(document_model_list) > 0 else None
-        # Bulk-insert paragraphs
+
+        # Handle the async tasks for advanced-learning documents
+        for idx, document in enumerate(instance_list):
+            llm_model_id = document.get('llm_model_id')
+            vision_model_id = document.get('vision_model_id')
+            if llm_model_id and vision_model_id and document_model_list:
+                # Find the matching document model
+                document_model = document_model_list[idx]
+                maxkb_logger.info(f"Submitting async advanced learning task for document: {document_model.id}")
+
+                # Set the document status to parsing
+                ListenerManagement.update_status(
+                    QuerySet(Document).filter(id=document_model.id),
+                    TaskType.EMBEDDING,
+                    State.PARSING
+                )
+
+                # Submit the async task
+                try:
+                    from knowledge.tasks.advanced_learning import advanced_learning_by_document
+                    advanced_learning_by_document.delay(
+                        str(document_model.id),
+                        str(knowledge_id),
+                        self.data.get('workspace_id', ''),
+                        llm_model_id,
+                        vision_model_id
+                    )
+                    maxkb_logger.info(f"Advanced learning task submitted for document {document_model.id}")
+                except Exception as e:
+                    maxkb_logger.error(f"Failed to submit advanced learning task: {str(e)}")
+                    # If submission fails, set the status to failure
+                    ListenerManagement.update_status(
+                        QuerySet(Document).filter(id=document_model.id),
+                        TaskType.EMBEDDING,
+                        State.FAILURE
+                    )
+
+        # Bulk-insert paragraphs (only for non-advanced-learning documents)
         if len(paragraph_model_list) > 0:
             for document in document_model_list:
                 max_position = Paragraph.objects.filter(document_id=document.id).aggregate(
@@ -1439,6 +1398,79 @@ class DocumentSerializers(serializers.Serializer):
                 pass


+    class BatchAdvancedLearning(serializers.Serializer):
+        workspace_id = serializers.CharField(required=True, label=_('workspace id'))
+        knowledge_id = serializers.UUIDField(required=True, label=_('knowledge id'))
+
+        def is_valid(self, *, raise_exception=False):
+            super().is_valid(raise_exception=True)
+            workspace_id = self.data.get('workspace_id')
+            query_set = QuerySet(Knowledge).filter(id=self.data.get('knowledge_id'))
+            if workspace_id:
+                query_set = query_set.filter(workspace_id=workspace_id)
+            if not query_set.exists():
+                raise AppApiException(500, _('Knowledge id does not exist'))
+
+        def batch_advanced_learning(self, instance: Dict, with_valid=True):
+            if with_valid:
+                self.is_valid(raise_exception=True)
+
+            document_id_list = instance.get("id_list", [])
+            llm_model_id = instance.get("llm_model")
+            vision_model_id = instance.get("vision_model")
+
+            if not document_id_list:
+                raise AppApiException(500, _('Document list is empty'))
+            if not llm_model_id or not vision_model_id:
+                raise AppApiException(500, _('Model selection is required'))
+
+            knowledge_id = self.data.get('knowledge_id')
+            workspace_id = self.data.get('workspace_id')
+
+            # Fetch the knowledge base
+            knowledge = QuerySet(Knowledge).filter(id=knowledge_id).first()
+            if not knowledge:
+                raise AppApiException(500, _('Knowledge not found'))
+
+            # Check the MinerU configuration
+            import os
+            mineru_api_type = os.environ.get('MINERU_API_TYPE', '')
+            if not mineru_api_type:
+                raise AppApiException(500, _('MinerU API not configured'))
+
+            # Set the document status to parsing (rather than queued)
+            for document_id in document_id_list:
+                ListenerManagement.update_status(
+                    QuerySet(Document).filter(id=document_id),
+                    TaskType.EMBEDDING,
+                    State.PARSING
+                )
+
+            # Dispatch the async task to process the documents
+            try:
+                from knowledge.tasks.advanced_learning import batch_advanced_learning
+                batch_advanced_learning.delay(
+                    document_id_list,
+                    str(knowledge_id),
+                    workspace_id,
+                    llm_model_id,
+                    vision_model_id
+                )
+                maxkb_logger.info(f"Submitted advanced learning tasks for {len(document_id_list)} documents")
+            except Exception as e:
+                maxkb_logger.error(f"Failed to submit advanced learning tasks: {str(e)}")
+                # If task submission fails, set the status to failure
+                for document_id in document_id_list:
+                    ListenerManagement.update_status(
+                        QuerySet(Document).filter(id=document_id),
+                        TaskType.EMBEDDING,
+                        State.FAILURE
+                    )
+                raise AppApiException(500, _('Failed to submit advanced learning tasks'))
+
+            return True
+
+
 class FileBufferHandle:
     buffer = None
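
A sketch of invoking this serializer directly, mirroring what the new view does; the IDs are placeholders, and MINERU_API_TYPE must be set in the environment or the call raises:

    DocumentSerializers.BatchAdvancedLearning(
        data={'workspace_id': 'ws-1', 'knowledge_id': '3fa85f64-5717-4562-b3fc-2c963f66afa6'}
    ).batch_advanced_learning({
        'id_list': ['doc-uuid-1', 'doc-uuid-2'],
        'llm_model': 'llm-model-uuid',
        'vision_model': 'vision-model-uuid',
    })
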
@@ -33,9 +33,9 @@ from knowledge.models import Knowledge, KnowledgeScope, KnowledgeType, Document,
 from knowledge.serializers.common import ProblemParagraphManage, drop_knowledge_index, get_embedding_model_id_by_knowledge_id, MetaSerializer, \
     GenerateRelatedSerializer, get_embedding_model_by_knowledge_id, list_paragraph, write_image, zip_dir
 from knowledge.serializers.document import DocumentSerializers
-from knowledge.task.embedding import embedding_by_knowledge, delete_embedding_by_knowledge
-from knowledge.task.generate import generate_related_by_knowledge_id
-from knowledge.task.sync import sync_web_knowledge, sync_replace_web_knowledge
+from knowledge.tasks.embedding import embedding_by_knowledge, delete_embedding_by_knowledge
+from knowledge.tasks.generate import generate_related_by_knowledge_id
+from knowledge.tasks.sync import sync_web_knowledge, sync_replace_web_knowledge
 from maxkb.conf import PROJECT_DIR
 from models_provider.models import Model
 from system_manage.models import WorkspaceUserResourcePermission, AuthTargetType
@@ -18,11 +18,11 @@ from knowledge.models import Paragraph, Problem, Document, ProblemParagraphMappi
 from knowledge.serializers.common import ProblemParagraphObject, ProblemParagraphManage, \
     get_embedding_model_id_by_knowledge_id, update_document_char_length, BatchSerializer
 from knowledge.serializers.problem import ProblemInstanceSerializer, ProblemSerializer, ProblemSerializers
-from knowledge.task.embedding import embedding_by_paragraph, enable_embedding_by_paragraph, \
+from knowledge.tasks.embedding import embedding_by_paragraph, enable_embedding_by_paragraph, \
     disable_embedding_by_paragraph, \
     delete_embedding_by_paragraph, embedding_by_problem as embedding_by_problem_task, delete_embedding_by_paragraph_ids, \
     embedding_by_problem, delete_embedding_by_source, update_embedding_document_id
-from knowledge.task.generate import generate_related_by_paragraph_id_list
+from knowledge.tasks.generate import generate_related_by_paragraph_id_list


 class ParagraphSerializer(serializers.ModelSerializer):
@@ -13,7 +13,7 @@ from common.exception.app_exception import AppApiException
 from common.utils.common import get_file_content
 from knowledge.models import Problem, ProblemParagraphMapping, Paragraph, Knowledge, SourceType
 from knowledge.serializers.common import get_embedding_model_id_by_knowledge_id
-from knowledge.task.embedding import delete_embedding_by_source_ids, update_problem_embedding, embedding_by_data_list
+from knowledge.tasks.embedding import delete_embedding_by_source_ids, update_problem_embedding, embedding_by_data_list
 from maxkb.const import PROJECT_DIR


@@ -3,7 +3,11 @@ SELECT
     "document".* ,
     to_json("document"."meta") as meta,
     to_json("document"."status_meta") as status_meta,
-    (SELECT "count"("id") FROM "paragraph" WHERE document_id="document"."id") as "paragraph_count"
+    (SELECT "count"("id") FROM "paragraph" WHERE document_id="document"."id") as "paragraph_count",
+    CASE
+        WHEN "document"."meta"->>'llm_model_id' IS NOT NULL THEN 'advanced'
+        ELSE 'regular'
+    END as "learning_type"
 FROM
     "document" "document"
 ${document_custom_sql}
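
With the new computed column, each row fetched by this query carries a learning_type discriminator alongside the paragraph count; roughly, at the Python layer (illustrative values only):

    row = {
        'id': 'doc-uuid',
        'paragraph_count': 12,
        # 'advanced' when meta->>'llm_model_id' is set, otherwise 'regular'
        'learning_type': 'advanced',
    }
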
apps/knowledge/tasks/__init__.py (new file, 5 lines)
@@ -0,0 +1,5 @@
+# coding=utf-8
+
+# Import tasks for Celery discovery
+# Note: We import the specific tasks, not * to avoid circular imports
+from .advanced_learning import advanced_learning_by_document, batch_advanced_learning
apps/knowledge/tasks/advanced_learning.py (new file, 208 lines)
@@ -0,0 +1,208 @@
+# coding=utf-8
+"""
+Advanced learning tasks - re-parse documents with MinerU
+"""
+import traceback
+import uuid as uuid_lib
+from typing import List
+
+from celery_once import QueueOnce
+from django.db.models import QuerySet
+
+from ops import celery_app
+
+
+@celery_app.task(name='celery:advanced_learning_by_document')
+def advanced_learning_by_document(document_id: str, knowledge_id: str, workspace_id: str,
+                                  llm_model_id: str, vision_model_id: str):
+    """
+    Process a document with MinerU advanced learning
+
+    @param document_id: document ID
+    @param knowledge_id: knowledge base ID
+    @param workspace_id: workspace ID
+    @param llm_model_id: large language model ID
+    @param vision_model_id: vision model ID
+    """
+    # Deferred imports to avoid circular dependencies
+    from common.event import ListenerManagement
+    from common.utils.logger import maxkb_logger
+    from knowledge.models import Document, Knowledge, Paragraph, State, TaskType, File, FileSourceType, get_default_status
+    from knowledge.serializers.paragraph import delete_problems_and_mappings
+    from knowledge.tasks.embedding import delete_embedding_by_document, embedding_by_document
+
+    maxkb_logger.info(f"Starting advanced learning for document {document_id}")
+
+    try:
+        # Immediately set the status to parsing
+        from common.event import ListenerManagement
+        ListenerManagement.update_status(
+            QuerySet(Document).filter(id=document_id),
+            TaskType.EMBEDDING,
+            State.PARSING
+        )
+        maxkb_logger.info(f"Updated document {document_id} status to PARSING")
+
+        # Fetch the document
+        document = QuerySet(Document).filter(id=document_id).first()
+        if not document:
+            maxkb_logger.error(f"Document {document_id} not found")
+            return
+
+        # Fetch the knowledge base
+        knowledge = QuerySet(Knowledge).filter(id=knowledge_id).first()
+        if not knowledge:
+            maxkb_logger.error(f"Knowledge {knowledge_id} not found")
+            return
+
+        # Fetch the source file
+        source_file_id = document.meta.get('source_file_id')
+        if not source_file_id:
+            maxkb_logger.warning(f"No source file for document {document.id}")
+            ListenerManagement.update_status(
+                QuerySet(Document).filter(id=document_id),
+                TaskType.EMBEDDING,
+                State.FAILURE
+            )
+            return
+
+        source_file = QuerySet(File).filter(id=source_file_id).first()
+        if not source_file:
+            maxkb_logger.warning(f"Source file not found for document {document.id}")
+            ListenerManagement.update_status(
+                QuerySet(Document).filter(id=document_id),
+                TaskType.EMBEDDING,
+                State.FAILURE
+            )
+            return
+
+        # Delete the existing paragraphs and embedding data
+        QuerySet(Paragraph).filter(document_id=document_id).delete()
+        delete_problems_and_mappings([document_id])
+        delete_embedding_by_document(document_id)
+
+        # Update the document metadata to record the models used
+        document.meta['llm_model_id'] = llm_model_id
+        document.meta['vision_model_id'] = vision_model_id
+        document.save()
+
+        # Re-parse the document with MinerU
+        from common.handle.impl.text.mineru_split_handle import MinerUSplitHandle
+        import io
+
+        mineru_handler = MinerUSplitHandle()
+
+        # Get the file content
+        file_content = source_file.get_bytes()
+        temp_file = io.BytesIO(file_content)
+        temp_file.name = source_file.file_name
+
+        def get_buffer(file):
+            file.seek(0)
+            return file.read()
+
+        def save_image(image_list):
+            if image_list is not None and len(image_list) > 0:
+                exist_image_list = [str(i.get('id')) for i in
+                                    QuerySet(File).filter(id__in=[i.id for i in image_list]).values('id')]
+                save_image_list = [image for image in image_list if not exist_image_list.__contains__(str(image.id))]
+                save_image_list = list({img.id: img for img in save_image_list}.values())
+                for file in save_image_list:
+                    file_bytes = file.meta.pop('content')
+                    file.meta['knowledge_id'] = knowledge_id
+                    file.source_type = FileSourceType.KNOWLEDGE
+                    file.source_id = knowledge_id
+                    file.save(file_bytes)
+
+        # Process the document with MinerU
+        maxkb_logger.info(f"Using MinerU to reprocess document: {document.name}")
+        paragraphs = mineru_handler.handle(
+            temp_file,
+            [],  # pattern_list
+            False,  # with_filter
+            0,  # limit (0 means unlimited)
+            get_buffer,
+            save_image,
+            llm_model_id=llm_model_id,
+            vision_model_id=vision_model_id
+        )
+
+        if paragraphs and len(paragraphs) > 0:
+            # Create the new paragraphs
+            paragraph_model_list = []
+            for index, paragraph in enumerate(paragraphs):
+                paragraph_instance = Paragraph(
+                    id=uuid_lib.uuid4(),
+                    document_id=document_id,
+                    knowledge_id=knowledge_id,
+                    content=paragraph.get('content', ''),
+                    title=paragraph.get('title', ''),
+                    status=get_default_status(),
+                    is_active=True,
+                    hit_num=0,
+                    position=index + 1
+                )
+                if 'image_list' in paragraph:
+                    paragraph_instance.image_list = paragraph['image_list']
+                paragraph_model_list.append(paragraph_instance)
+
+            # Bulk-insert the paragraphs
+            QuerySet(Paragraph).bulk_create(paragraph_model_list)
+
+            # Update the document character count
+            char_length = sum([len(p.content) for p in paragraph_model_list])
+            document.char_length = char_length
+            document.save()
+
+            # MinerU parsing is done; start the embedding task
+            embedding_model_id = knowledge.embedding_model_id
+            maxkb_logger.info(f"Starting embedding for document {document_id} after MinerU parsing")
+
+            # Invoke the embedding task; embedding_by_document will move the status from PARSING to STARTED
+            embedding_by_document.delay(
+                str(document_id),
+                str(embedding_model_id)
+            )
+
+            maxkb_logger.info(f"MinerU reprocessing completed for document: {document.name}, "
+                              f"created {len(paragraph_model_list)} paragraphs")
+        else:
+            maxkb_logger.warning(f"MinerU returned no paragraphs for document: {document.name}")
+            # Set the status to failure
+            ListenerManagement.update_status(
+                QuerySet(Document).filter(id=document_id),
+                TaskType.EMBEDDING,
+                State.FAILURE
+            )
+
+    except Exception as e:
+        maxkb_logger.error(f"Failed to process document {document_id}: {str(e)}", exc_info=True)
+        # Set the status to failure
+        ListenerManagement.update_status(
+            QuerySet(Document).filter(id=document_id),
+            TaskType.EMBEDDING,
+            State.FAILURE
+        )
+
+
+@celery_app.task(name='celery:batch_advanced_learning')
+def batch_advanced_learning(document_id_list: List[str], knowledge_id: str, workspace_id: str,
+                            llm_model_id: str, vision_model_id: str):
+    """
+    Batch advanced learning task
+
+    @param document_id_list: list of document IDs
+    @param knowledge_id: knowledge base ID
+    @param workspace_id: workspace ID
+    @param llm_model_id: large language model ID
+    @param vision_model_id: vision model ID
+    """
+    from common.utils.logger import maxkb_logger
+    maxkb_logger.info(f"batch_advanced_learning called with {len(document_id_list)} documents")
+
+    for document_id in document_id_list:
+        maxkb_logger.info(f"Submitting advanced_learning_by_document for document {document_id}")
+        advanced_learning_by_document.apply_async(
+            args=[str(document_id), str(knowledge_id), workspace_id, llm_model_id, vision_model_id],
+            queue='celery'
+        )
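
Both tasks are dispatched elsewhere in this commit through Celery; a minimal sketch of the two entry points, with placeholder IDs:

    from knowledge.tasks.advanced_learning import (
        advanced_learning_by_document,
        batch_advanced_learning,
    )

    # single document
    advanced_learning_by_document.delay(
        'doc-uuid', 'kb-uuid', 'ws-1', 'llm-model-uuid', 'vision-model-uuid')

    # fan-out: queues one advanced_learning_by_document per document
    batch_advanced_learning.delay(
        ['doc-uuid-1', 'doc-uuid-2'], 'kb-uuid', 'ws-1',
        'llm-model-uuid', 'vision-model-uuid')
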
@@ -12,7 +12,7 @@ from common.event import ListenerManagement
 from common.utils.logger import maxkb_logger
 from common.utils.page_utils import page, page_desc
 from knowledge.models import Paragraph, Document, Status, TaskType, State
-from knowledge.task.handler import save_problem
+from knowledge.tasks.handler import save_problem
 from models_provider.models import Model
 from models_provider.tools import get_model
 from ops import celery_app
@@ -23,7 +23,7 @@ from ops import celery_app

 @celery_app.task(base=QueueOnce, once={'keys': ['knowledge_id']}, name='celery:sync_web_knowledge')
 def sync_web_knowledge(knowledge_id: str, url: str, selector: str):
-    from knowledge.task.handler import get_save_handler
+    from knowledge.tasks.handler import get_save_handler

     try:
         maxkb_logger.info(
@@ -40,7 +40,7 @@ def sync_web_knowledge(knowledge_id: str, url: str, selector: str):

 @celery_app.task(base=QueueOnce, once={'keys': ['knowledge_id']}, name='celery:sync_replace_web_knowledge')
 def sync_replace_web_knowledge(knowledge_id: str, url: str, selector: str):
-    from knowledge.task.handler import get_sync_handler
+    from knowledge.tasks.handler import get_sync_handler

     try:
         maxkb_logger.info(
@@ -56,7 +56,7 @@ def sync_replace_web_knowledge(knowledge_id: str, url: str, selector: str):

 @celery_app.task(name='celery:sync_web_document')
 def sync_web_document(knowledge_id, source_url_list: List[str], selector: str):
-    from knowledge.task.handler import get_sync_web_document_handler
+    from knowledge.tasks.handler import get_sync_web_document_handler

     handler = get_sync_web_document_handler(knowledge_id)
     for source_url in source_url_list:
@@ -26,6 +26,7 @@ urlpatterns = [
     path('workspace/<str:workspace_id>/knowledge/<str:knowledge_id>/document/batch_sync', views.DocumentView.BatchSync.as_view()),
     path('workspace/<str:workspace_id>/knowledge/<str:knowledge_id>/document/batch_delete', views.DocumentView.BatchDelete.as_view()),
     path('workspace/<str:workspace_id>/knowledge/<str:knowledge_id>/document/batch_refresh', views.DocumentView.BatchRefresh.as_view()),
+    path('workspace/<str:workspace_id>/knowledge/<str:knowledge_id>/document/batch_advanced_learning', views.BatchAdvancedLearning.as_view()),
     path('workspace/<str:workspace_id>/knowledge/<str:knowledge_id>/document/batch_generate_related', views.DocumentView.BatchGenerateRelated.as_view()),
     path('workspace/<str:workspace_id>/knowledge/<str:knowledge_id>/document/web', views.WebDocumentView.as_view()),
    path('workspace/<str:workspace_id>/knowledge/<str:knowledge_id>/document/qa', views.QaDocumentView.as_view()),
@@ -13,7 +13,7 @@ from knowledge.api.document import DocumentSplitAPI, DocumentBatchAPI, DocumentB
     DocumentReadAPI, DocumentEditAPI, DocumentDeleteAPI, TableDocumentCreateAPI, QaDocumentCreateAPI, \
     WebDocumentCreateAPI, CancelTaskAPI, BatchCancelTaskAPI, SyncWebAPI, RefreshAPI, BatchEditHitHandlingAPI, \
     DocumentTreeReadAPI, DocumentSplitPatternAPI, BatchRefreshAPI, BatchGenerateRelatedAPI, TemplateExportAPI, \
-    DocumentExportAPI, DocumentMigrateAPI, DocumentDownloadSourceAPI
+    DocumentExportAPI, DocumentMigrateAPI, DocumentDownloadSourceAPI, BatchAdvancedLearningAPI
 from knowledge.serializers.common import get_knowledge_operation_object
 from knowledge.serializers.document import DocumentSerializers
 from knowledge.views.common import get_knowledge_document_operation_object, get_document_operation_object_batch, \
@@ -817,3 +817,39 @@ class TableTemplate(APIView):
         tags=[_('Knowledge Base/Documentation')])  # type: ignore
     def get(self, request: Request):
         return DocumentSerializers.Export(data={'type': request.query_params.get('type')}).table_export(with_valid=True)
+
+
+class BatchAdvancedLearning(APIView):
+    authentication_classes = [TokenAuth]
+
+    @extend_schema(
+        methods=['PUT'],
+        summary=_('Batch advanced learning with MinerU'),
+        operation_id=_('Batch advanced learning with MinerU'),  # type: ignore
+        request=BatchAdvancedLearningAPI.get_request(),
+        parameters=BatchAdvancedLearningAPI.get_parameters(),
+        responses=BatchAdvancedLearningAPI.get_response(),
+        tags=[_('Knowledge Base/Documentation')]  # type: ignore
+    )
+    @has_permissions(
+        PermissionConstants.KNOWLEDGE_DOCUMENT_VECTOR.get_workspace_knowledge_permission(),
+        PermissionConstants.KNOWLEDGE_DOCUMENT_VECTOR.get_workspace_permission_workspace_manage_role(),
+        PermissionConstants.KNOWLEDGE_DOCUMENT_EDIT.get_workspace_knowledge_permission(),
+        PermissionConstants.KNOWLEDGE_DOCUMENT_EDIT.get_workspace_permission_workspace_manage_role(),
+        RoleConstants.WORKSPACE_MANAGE.get_workspace_role(),
+        ViewPermission([RoleConstants.USER.get_workspace_role()],
+                       [PermissionConstants.KNOWLEDGE.get_workspace_knowledge_permission()], CompareConstants.AND),
+    )
+    @log(
+        menu='document', operate="Batch advanced learning",
+        get_operation_object=lambda r, keywords: get_knowledge_document_operation_object(
+            get_knowledge_operation_object(keywords.get('knowledge_id')),
+            get_document_operation_object_batch(r.data.get('id_list')),
+        ),
+    )
+    def put(self, request: Request, workspace_id: str, knowledge_id: str):
+        return result.success(
+            DocumentSerializers.BatchAdvancedLearning(
+                data={'workspace_id': workspace_id, 'knowledge_id': knowledge_id}
+            ).batch_advanced_learning(request.data)
+        )
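
The view is reachable at the PUT route added in urls.py; a hedged client sketch using requests — the host, URL prefix, and auth header format are assumptions, and the IDs are placeholders:

    import requests

    resp = requests.put(
        'http://localhost:8080/api/workspace/ws-1/knowledge/kb-uuid'
        '/document/batch_advanced_learning',  # prefix assumed
        json={
            'id_list': ['doc-uuid-1'],
            'llm_model': 'llm-model-uuid',
            'vision_model': 'vision-model-uuid',
        },
        headers={'Authorization': '<token>'},  # TokenAuth; exact scheme assumed
    )
    print(resp.json())
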
apps/models_provider/impl/kimi_model_provider/credential/image.py (new file, 79 lines)
@@ -0,0 +1,79 @@
+# coding=utf-8
+"""
+@project: MaxKB
+@Author:MaxKB Team
+@file: image.py
+@date:2024/12/20
+@desc: Kimi Image/Vision Model Credential
+"""
+import traceback
+from typing import Dict
+
+from django.utils.translation import gettext_lazy as _, gettext
+from langchain_core.messages import HumanMessage
+
+from common import forms
+from common.exception.app_exception import AppApiException
+from common.forms import BaseForm, TooltipLabel
+from models_provider.base_model_provider import BaseModelCredential, ValidCode
+
+
+class KimiImageModelParams(BaseForm):
+    temperature = forms.SliderField(TooltipLabel(_('Temperature'),
+                                                 _('Higher values make the output more random, while lower values make it more focused and deterministic')),
+                                    required=True, default_value=0.3,
+                                    _min=0.1,
+                                    _max=1.0,
+                                    _step=0.01,
+                                    precision=2)
+
+    max_tokens = forms.SliderField(
+        TooltipLabel(_('Output the maximum Tokens'),
+                     _('Specify the maximum number of tokens that the model can generate')),
+        required=True, default_value=1024,
+        _min=1,
+        _max=100000,
+        _step=1,
+        precision=0)
+
+
+class KimiImageModelCredential(BaseForm, BaseModelCredential):
+    api_base = forms.TextInputField('API URL', required=True)
+    api_key = forms.PasswordInputField('API Key', required=True)
+
+    def is_valid(self, model_type: str, model_name, model_credential: Dict[str, object], model_params, provider,
+                 raise_exception=False):
+        model_type_list = provider.get_model_type_list()
+        if not any(list(filter(lambda mt: mt.get('value') == model_type, model_type_list))):
+            raise AppApiException(ValidCode.valid_error.value,
+                                  gettext('{model_type} Model type is not supported').format(model_type=model_type))
+
+        for key in ['api_base', 'api_key']:
+            if key not in model_credential:
+                if raise_exception:
+                    raise AppApiException(ValidCode.valid_error.value, gettext('{key} is required').format(key=key))
+                else:
+                    return False
+        try:
+            model = provider.get_model(model_type, model_name, model_credential, **model_params)
+            res = model.stream([HumanMessage(content=[{"type": "text", "text": gettext('Hello')}])])
+            for chunk in res:
+                pass  # Just consume the stream to validate
+        except Exception as e:
+            traceback.print_exc()
+            if isinstance(e, AppApiException):
+                raise e
+            if raise_exception:
+                raise AppApiException(ValidCode.valid_error.value,
+                                      gettext(
+                                          'Verification failed, please check whether the parameters are correct: {error}').format(
+                                          error=str(e)))
+            else:
+                return False
+        return True
+
+    def encryption_dict(self, model: Dict[str, object]):
+        return {**model, 'api_key': super().encryption(model.get('api_key', ''))}
+
+    def get_model_params_setting_form(self, model_name):
+        return KimiImageModelParams()
@@ -12,11 +12,15 @@ from common.utils.common import get_file_content
 from models_provider.base_model_provider import IModelProvider, ModelProvideInfo, ModelInfo, \
     ModelTypeConst, ModelInfoManage
 from models_provider.impl.kimi_model_provider.credential.llm import KimiLLMModelCredential
+from models_provider.impl.kimi_model_provider.credential.image import KimiImageModelCredential
 from models_provider.impl.kimi_model_provider.model.llm import KimiChatModel
+from models_provider.impl.kimi_model_provider.model.image import KimiImageModel
 from maxkb.conf import PROJECT_DIR

 kimi_llm_model_credential = KimiLLMModelCredential()
+kimi_image_model_credential = KimiImageModelCredential()

+# LLM Models
 moonshot_v1_8k = ModelInfo('moonshot-v1-8k', '', ModelTypeConst.LLM, kimi_llm_model_credential,
                            KimiChatModel)
 moonshot_v1_32k = ModelInfo('moonshot-v1-32k', '', ModelTypeConst.LLM, kimi_llm_model_credential,
@@ -24,8 +28,24 @@ moonshot_v1_32k = ModelInfo('moonshot-v1-32k', '', ModelTypeConst.LLM, kimi_llm_
 moonshot_v1_128k = ModelInfo('moonshot-v1-128k', '', ModelTypeConst.LLM, kimi_llm_model_credential,
                              KimiChatModel)

-model_info_manage = ModelInfoManage.builder().append_model_info(moonshot_v1_8k).append_model_info(
-    moonshot_v1_32k).append_default_model_info(moonshot_v1_128k).append_default_model_info(moonshot_v1_8k).build()
+# Vision/Image Models
+moonshot_v1_8k_vision = ModelInfo('moonshot-v1-8k-vision-preview', 'Kimi 视觉模型 8K', ModelTypeConst.IMAGE,
+                                  kimi_image_model_credential, KimiImageModel)
+moonshot_v1_32k_vision = ModelInfo('moonshot-v1-32k-vision-preview', 'Kimi 视觉模型 32K', ModelTypeConst.IMAGE,
+                                   kimi_image_model_credential, KimiImageModel)
+moonshot_v1_128k_vision = ModelInfo('moonshot-v1-128k-vision-preview', 'Kimi 视觉模型 128K', ModelTypeConst.IMAGE,
+                                    kimi_image_model_credential, KimiImageModel)
+
+model_info_manage = (ModelInfoManage.builder()
+                     .append_model_info(moonshot_v1_8k)
+                     .append_model_info(moonshot_v1_32k)
+                     .append_default_model_info(moonshot_v1_128k)
+                     .append_default_model_info(moonshot_v1_8k)
+                     .append_model_info(moonshot_v1_8k_vision)
+                     .append_model_info(moonshot_v1_32k_vision)
+                     .append_model_info(moonshot_v1_128k_vision)
+                     .append_default_model_info(moonshot_v1_128k_vision)
+                     .build())


 class KimiModelProvider(IModelProvider):
apps/models_provider/impl/kimi_model_provider/model/image.py (new file, 32 lines)
@@ -0,0 +1,32 @@
+# coding=utf-8
+"""
+@project: MaxKB
+@Author:MaxKB Team
+@file: image.py
+@date:2024/12/20
+@desc: Kimi Image/Vision Model Implementation
+"""
+from typing import Dict
+
+from models_provider.base_model_provider import MaxKBBaseModel
+from models_provider.impl.base_chat_open_ai import BaseChatOpenAI
+
+
+class KimiImageModel(MaxKBBaseModel, BaseChatOpenAI):
+
+    @staticmethod
+    def is_cache_model():
+        return False
+
+    @staticmethod
+    def new_instance(model_type, model_name, model_credential: Dict[str, object], **model_kwargs):
+        optional_params = MaxKBBaseModel.filter_optional_params(model_kwargs)
+
+        return KimiImageModel(
+            model_name=model_name,
+            openai_api_base=model_credential.get('api_base'),
+            openai_api_key=model_credential.get('api_key'),
+            streaming=True,
+            stream_usage=True,
+            extra_body=optional_params
+        )
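
A sketch of constructing the new vision model through its factory; the endpoint and key are placeholders:

    model = KimiImageModel.new_instance(
        'IMAGE',
        'moonshot-v1-8k-vision-preview',
        {'api_base': 'https://api.moonshot.cn/v1',  # placeholder endpoint
         'api_key': 'sk-...'},                      # placeholder key
        temperature=0.3,
        max_tokens=1024,
    )
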
@@ -7,3 +7,15 @@
 @desc:
 """
 from .celery import app as celery_app
+
+# Import and register advanced learning tasks
+try:
+    from knowledge.tasks.advanced_learning import (
+        advanced_learning_by_document,
+        batch_advanced_learning
+    )
+    # Register tasks with the celery app
+    celery_app.register_task(advanced_learning_by_document)
+    celery_app.register_task(batch_advanced_learning)
+except ImportError:
+    pass
dev/Makefile (new file, 67 lines)
@@ -0,0 +1,67 @@
+# MaxKB development environment shortcuts
+
+# Quickly restart the Python process (most common)
+restart:
+	@echo "🔄 重启MaxKB Python进程..."
+	@docker exec maxkb-dev pkill -f "python.*main.py" 2>/dev/null || true
+	@sleep 2
+	@echo "✅ 重启完成"
+
+# Tail the live logs
+logs:
+	docker logs -f maxkb-dev
+
+# Show the latest 20 log lines
+log:
+	docker logs maxkb-dev --tail 20
+
+# Open a shell inside the container
+shell:
+	docker exec -it maxkb-dev bash
+
+# List the Python processes
+ps:
+	@docker exec maxkb-dev ps aux | grep python | grep -v grep || echo "没有找到Python进程"
+
+# Fully restart the container
+full-restart:
+	docker compose -f docker-compose-simple.yml restart
+
+# Stop the container
+stop:
+	docker compose -f docker-compose-simple.yml stop
+
+# Start the container
+start:
+	docker compose -f docker-compose-simple.yml start
+
+# Rebuild and start
+rebuild:
+	docker compose -f docker-compose-simple.yml down
+	docker compose -f docker-compose-simple.yml up -d --build
+
+# Show container status
+status:
+	@docker ps --filter name=maxkb-dev --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}"
+
+# Truncate log files
+clean-logs:
+	@docker exec maxkb-dev bash -c "find /opt/maxkb-app -name '*.log' -type f -exec truncate -s 0 {} \;" 2>/dev/null || true
+	@echo "日志已清理"
+
+# Help
+help:
+	@echo "MaxKB 开发环境快捷命令:"
+	@echo "  make restart      - 快速重启Python进程(最快)"
+	@echo "  make logs         - 查看实时日志"
+	@echo "  make log          - 查看最新20行日志"
+	@echo "  make shell        - 进入容器Shell"
+	@echo "  make ps           - 查看Python进程"
+	@echo "  make full-restart - 完全重启容器"
+	@echo "  make stop         - 停止容器"
+	@echo "  make start        - 启动容器"
+	@echo "  make rebuild      - 重新构建并启动"
+	@echo "  make status       - 查看容器状态"
+	@echo "  make clean-logs   - 清理日志文件"
+
+.DEFAULT_GOAL := help
dev/sync_mineru_dirs.py (new executable file, 281 lines)
@@ -0,0 +1,281 @@
+#!/usr/bin/env python3
+"""
+Two-way sync script: synchronize files between two mineru directories
+Directory 1: ~/Documents/felo/gptbase-parser/loader/mineru
+Directory 2: /Users/moshui/Documents/felo/moshui/MaxKB/apps/common/handle/impl/mineru
+"""
+
+import os
+import sys
+import time
+import shutil
+import hashlib
+import argparse
+from pathlib import Path
+from datetime import datetime
+from typing import Set, Tuple, Optional
+import subprocess
+
+try:
+    from watchdog.observers import Observer
+    from watchdog.events import FileSystemEventHandler
+    WATCHDOG_AVAILABLE = True
+except ImportError:
+    WATCHDOG_AVAILABLE = False
+    print("警告: watchdog 未安装,实时监控功能不可用")
+    print("运行 'pip install watchdog' 来启用实时监控功能")
+
+
+class DirectorySyncer:
+    def __init__(self, dir1: str, dir2: str, verbose: bool = False):
+        self.dir1 = Path(dir1).expanduser().resolve()
+        self.dir2 = Path(dir2).expanduser().resolve()
+        self.verbose = verbose
+        self.exclude_patterns = {
+            '__pycache__',
+            '.DS_Store',
+            '*.pyc',
+            '*.pyo',
+            '.git',
+            '.idea',
+            '.vscode',
+            '*.swp',
+            '*.swo',
+            '*~'
+        }
+
+    def should_exclude(self, path: Path) -> bool:
+        """Check whether a file or directory should be excluded"""
+        name = path.name
+
+        for pattern in self.exclude_patterns:
+            if pattern.startswith('*'):
+                if name.endswith(pattern[1:]):
+                    return True
+            elif pattern.endswith('*'):
+                if name.startswith(pattern[:-1]):
+                    return True
+            elif name == pattern:
+                return True
+
+        return False
+
+    def get_file_hash(self, filepath: Path) -> str:
+        """Compute the MD5 hash of a file"""
+        hash_md5 = hashlib.md5()
+        try:
+            with open(filepath, "rb") as f:
+                for chunk in iter(lambda: f.read(4096), b""):
+                    hash_md5.update(chunk)
+            return hash_md5.hexdigest()
+        except Exception as e:
+            if self.verbose:
+                print(f"无法计算 {filepath} 的哈希值: {e}")
+            return ""
+
+    def get_relative_files(self, directory: Path) -> Set[Path]:
+        """Collect the relative paths of all files in a directory"""
+        files = set()
+        for item in directory.rglob("*"):
+            if self.should_exclude(item):
+                continue
+            if item.is_file():
+                relative_path = item.relative_to(directory)
+                files.add(relative_path)
+        return files
+
+    def sync_file(self, source: Path, dest: Path, relative_path: Path) -> bool:
+        """Synchronize a single file"""
+        source_file = source / relative_path
+        dest_file = dest / relative_path
+
+        try:
+            dest_file.parent.mkdir(parents=True, exist_ok=True)
+
+            if not dest_file.exists():
+                shutil.copy2(source_file, dest_file)
+                if self.verbose:
+                    print(f"复制: {relative_path}")
+                return True
+            else:
+                source_hash = self.get_file_hash(source_file)
+                dest_hash = self.get_file_hash(dest_file)
+
+                if source_hash != dest_hash:
+                    source_mtime = source_file.stat().st_mtime
+                    dest_mtime = dest_file.stat().st_mtime
+
+                    if source_mtime > dest_mtime:
+                        shutil.copy2(source_file, dest_file)
+                        if self.verbose:
+                            print(f"更新: {relative_path} (源文件较新)")
+                        return True
+                    elif self.verbose:
+                        print(f"跳过: {relative_path} (目标文件较新或相同)")
+
+        except Exception as e:
+            print(f"错误同步 {relative_path}: {e}")
+            return False
+
+        return False
+
+    def sync_directories(self) -> Tuple[int, int]:
+        """Run the two-way sync"""
+        print(f"\n开始同步...")
+        print(f"目录1: {self.dir1}")
+        print(f"目录2: {self.dir2}")
+        print("-" * 60)
+
+        files1 = self.get_relative_files(self.dir1)
+        files2 = self.get_relative_files(self.dir2)
+
+        all_files = files1 | files2
+        synced_count = 0
+        deleted_count = 0
+
+        for rel_path in all_files:
+            file1 = self.dir1 / rel_path
+            file2 = self.dir2 / rel_path
+
+            if file1.exists() and not file2.exists():
+                if self.sync_file(self.dir1, self.dir2, rel_path):
+                    synced_count += 1
+                    print(f"→ {rel_path}")
+
+            elif file2.exists() and not file1.exists():
+                if self.sync_file(self.dir2, self.dir1, rel_path):
+                    synced_count += 1
+                    print(f"← {rel_path}")
+
+            elif file1.exists() and file2.exists():
+                hash1 = self.get_file_hash(file1)
+                hash2 = self.get_file_hash(file2)
+
+                if hash1 != hash2:
+                    mtime1 = file1.stat().st_mtime
+                    mtime2 = file2.stat().st_mtime
+
+                    if mtime1 > mtime2:
+                        if self.sync_file(self.dir1, self.dir2, rel_path):
+                            synced_count += 1
+                            print(f"→ {rel_path} (更新)")
+                    else:
+                        if self.sync_file(self.dir2, self.dir1, rel_path):
+                            synced_count += 1
+                            print(f"← {rel_path} (更新)")
+
+        return synced_count, deleted_count
+
+    def watch(self):
+        """Start file watching"""
+        if not WATCHDOG_AVAILABLE:
+            print("错误: watchdog 模块未安装")
+            print("请运行: pip install watchdog")
+            return
+
+        class SyncHandler(FileSystemEventHandler):
+            def __init__(self, syncer):
+                self.syncer = syncer
+                self.last_sync = 0
+                self.sync_delay = 1
+
+            def on_any_event(self, event):
+                if event.is_directory:
+                    return
+
+                current_time = time.time()
+                if current_time - self.last_sync > self.sync_delay:
+                    path = Path(event.src_path)
+                    if not self.syncer.should_exclude(path):
+                        print(f"\n检测到变化: {path.name}")
+                        self.syncer.sync_directories()
+                        self.last_sync = current_time
+
+        event_handler = SyncHandler(self)
+        observer = Observer()
+        observer.schedule(event_handler, str(self.dir1), recursive=True)
+        observer.schedule(event_handler, str(self.dir2), recursive=True)
+        observer.start()
+
+        print(f"\n监控模式已启动...")
+        print(f"正在监控两个目录的变化,按 Ctrl+C 退出")
+        print("-" * 60)
+
+        try:
+            while True:
+                time.sleep(1)
+        except KeyboardInterrupt:
+            observer.stop()
+            print("\n监控已停止")
+        observer.join()
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="双向同步两个 mineru 目录",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+示例:
+  %(prog)s           # 执行一次同步
+  %(prog)s --watch   # 启动实时监控模式
+  %(prog)s --verbose # 显示详细信息
+  %(prog)s --dry-run # 模拟运行,不实际同步
+"""
+    )
+
+    parser.add_argument(
+        "--watch", "-w",
+        action="store_true",
+        help="启动监控模式,实时同步文件变化"
+    )
+
+    parser.add_argument(
+        "--verbose", "-v",
+        action="store_true",
+        help="显示详细输出"
+    )
+
+    parser.add_argument(
+        "--dry-run", "-n",
+        action="store_true",
+        help="模拟运行,只显示将要执行的操作"
+    )
+
+    parser.add_argument(
+        "--dir1",
+        default="~/Documents/felo/gptbase-parser/loader/mineru",
+        help="第一个目录路径 (默认: ~/Documents/felo/gptbase-parser/loader/mineru)"
+    )
+
+    parser.add_argument(
+        "--dir2",
+        default="/Users/moshui/Documents/felo/moshui/MaxKB/apps/common/handle/impl/mineru",
+        help="第二个目录路径"
+    )
+
+    args = parser.parse_args()
+
+    syncer = DirectorySyncer(args.dir1, args.dir2, verbose=args.verbose)
+
+    if not syncer.dir1.exists():
+        print(f"错误: 目录不存在 - {syncer.dir1}")
+        sys.exit(1)
+
+    if not syncer.dir2.exists():
+        print(f"错误: 目录不存在 - {syncer.dir2}")
+        sys.exit(1)
+
+    if args.dry_run:
+        print("模拟运行模式 - 不会实际修改文件")
+        syncer.verbose = True
+
+    if args.watch:
+        syncer.sync_directories()
+        syncer.watch()
+    else:
+        synced, deleted = syncer.sync_directories()
+        print("-" * 60)
+        print(f"同步完成: {synced} 个文件已同步")
+        if deleted > 0:
+            print(f"  {deleted} 个文件已删除")
+
+
+if __name__ == "__main__":
+    main()
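
The syncer can also be driven programmatically; a sketch using the script's own class with its default paths:

    from sync_mineru_dirs import DirectorySyncer  # assumes the script is on sys.path

    syncer = DirectorySyncer(
        '~/Documents/felo/gptbase-parser/loader/mineru',
        '/Users/moshui/Documents/felo/moshui/MaxKB/apps/common/handle/impl/mineru',
        verbose=True,
    )
    synced, deleted = syncer.sync_directories()  # one-shot two-way sync
    # syncer.watch()  # watch mode; requires watchdog
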
@@ -361,6 +361,35 @@ const putBatchRefresh: (
   )
 }

+/**
+ * Batch advanced learning (re-parse)
+ * @param knowledge_id,
+ *{
+    "id_list": [
+      "3fa85f64-5717-4562-b3fc-2c963f66afa6"
+    ],
+    "llm_model": "model_id",
+    "vision_model": "model_id"
+  }
+ */
+const putBatchAdvancedLearning: (
+  knowledge_id: string,
+  idList: string[],
+  models: { llmModel: string; visionModel: string },
+  loading?: Ref<boolean>,
+) => Promise<Result<boolean>> = (knowledge_id, idList, models, loading) => {
+  return put(
+    `${prefix.value}/${knowledge_id}/document/batch_advanced_learning`,
+    {
+      id_list: idList,
+      llm_model: models.llmModel,
+      vision_model: models.visionModel,
+    },
+    undefined,
+    loading,
+  )
+}
+
 /**
  * Batch sync documents
  * @param knowledge_id,
@@ -582,6 +611,7 @@ export default {
   putBatchGenerateRelated,
   putBatchEditHitHandling,
   putBatchRefresh,
+  putBatchAdvancedLearning,
   putMulSyncDocument,
   putMigrateMulDocument,
   postQADocument,
@@ -111,4 +111,5 @@ export default {
   copyTitle: '副本',
   professional: '购买专业版',
   sync: '同步',
+  shared: '共享',
 }
@@ -42,7 +42,7 @@ export default {

   fileType: {
     txt: {
-      label: '文本文件',
+      label: '普通学习',
       tip1: '1、文件上传前,建议规范文件的分段标识',
     },
     table: {
@@ -97,6 +97,7 @@ export default {
     name: '文件名称',
     char_length: '字符数',
     paragraph: '分段',
+    learningType: '学习方式',
     all: '全部',
     updateTime: '更新时间',
   },
@@ -109,6 +110,7 @@ export default {
     GENERATE: '生成中',
     SYNC: '同步中',
     REVOKE: '取消中',
+    PARSING: '学习中',
     finish: '完成',
   },
   enableStatus: {
@@ -181,4 +183,23 @@ export default {
     allCheck: '全选',
     errorMessage1: '请选择文档',
   },
+  advancedLearning: {
+    title: '高级学习',
+    button: '高级学习',
+    llmModel: '大语言模型',
+    visionModel: '视觉模型',
+    selectLlmModel: '请选择大语言模型',
+    selectVisionModel: '请选择视觉模型',
+    llmModelRequired: '请选择大语言模型',
+    visionModelRequired: '请选择视觉模型',
+    tip1: '高级学习提供高质量的 PDF 和 PPT 文档解析,支持复杂表格、图片、公式等内容',
+    tip2: '原有段落和索引将被删除,文档将使用高级学习重新解析',
+    tip3: '解析过程可能需要较长时间,请耐心等待',
+    successMessage: '高级学习任务已启动',
+    loadModelsFailed: '加载模型列表失败',
+  },
+  learningType: {
+    regular: '普通学习',
+    advanced: '高级学习',
+  },
 }
@@ -20,6 +20,8 @@ interface StateInterface {
   REVOKE: '4'
   // cancel succeeded
   REVOKED: '5'
+  // parsing
+  PARSING: '6'
   IGNORED: 'n'
 }
 const TaskType: TaskTypeInterface = {
@@ -40,6 +42,8 @@ const State: StateInterface = {
   REVOKE: '4',
   // cancel succeeded
   REVOKED: '5',
+  // parsing
+  PARSING: '6',
   IGNORED: 'n'
 }
 class Status {
ui/src/views/document/component/AdvancedLearningDialog.vue (new file, 185 lines)
@@ -0,0 +1,185 @@
+<template>
+  <el-dialog
+    v-model="dialogVisible"
+    :title="$t('views.document.advancedLearning.title')"
+    :before-close="close"
+    width="630px"
+  >
+    <el-form :model="form" :rules="rules" ref="formRef" label-position="top" class="form-container">
+      <el-form-item :label="$t('views.document.advancedLearning.llmModel')" prop="llmModel">
+        <el-select
+          v-model="form.llmModel"
+          :placeholder="$t('views.document.advancedLearning.selectLlmModel')"
+          class="w-full"
+          clearable
+        >
+          <el-option
+            v-for="model in llmModels"
+            :key="model.id"
+            :label="model.name"
+            :value="model.id"
+          >
+            <span>{{ model.name }}</span>
+            <el-tag v-if="model.type === 'share'" size="small" class="ml-8">{{
+              $t('common.shared')
+            }}</el-tag>
+          </el-option>
+        </el-select>
+      </el-form-item>
+      <el-form-item :label="$t('views.document.advancedLearning.visionModel')" prop="visionModel">
+        <el-select
+          v-model="form.visionModel"
+          :placeholder="$t('views.document.advancedLearning.selectVisionModel')"
+          class="w-full"
+          clearable
+        >
+          <el-option
+            v-for="model in visionModels"
+            :key="model.id"
+            :label="model.name"
+            :value="model.id"
+          >
+            <span>{{ model.name }}</span>
+            <el-tag v-if="model.type === 'share'" size="small" class="ml-8">{{
+              $t('common.shared')
+            }}</el-tag>
+          </el-option>
+        </el-select>
+      </el-form-item>
+      <div class="update-info flex border-r-6 mb-16" style="padding: 8px 12px;">
+        <div class="mt-4">
+          <AppIcon iconName="app-warning-colorful" style="font-size: 16px"></AppIcon>
+        </div>
+        <div class="ml-16 lighter" style="padding-right: 12px;">
+          <p>{{ $t('views.document.advancedLearning.tip1') }}</p>
+          <p>{{ $t('views.document.advancedLearning.tip2') }}</p>
+          <p>{{ $t('views.document.advancedLearning.tip3') }}</p>
+        </div>
+      </div>
+    </el-form>
+    <template #footer>
+      <div class="dialog-footer">
+        <el-button @click="close">{{ $t('common.cancel') }}</el-button>
+        <el-button type="primary" @click="submit" :loading="loading">
+          {{ $t('common.submit') }}
+        </el-button>
+      </div>
+    </template>
+  </el-dialog>
+</template>

+<script setup lang="ts">
+import { ref, reactive } from 'vue'
+import modelApi from '@/api/model/model'
+import type { FormInstance, FormRules } from 'element-plus'
+import { MsgError } from '@/utils/message'
+import { useI18n } from 'vue-i18n'
+
+const { t } = useI18n()
+
+const dialogVisible = ref<boolean>(false)
+const loading = ref<boolean>(false)
+const formRef = ref<FormInstance>()
+
+const form = reactive({
+  llmModel: null as string | null,
+  visionModel: null as string | null,
+})
+
+const rules = reactive<FormRules>({
+  llmModel: [
+    {
+      required: true,
+      message: t('views.document.advancedLearning.llmModelRequired'),
+      trigger: 'change',
+    },
+  ],
+  visionModel: [
+    {
+      required: true,
+      message: t('views.document.advancedLearning.visionModelRequired'),
+      trigger: 'change',
+    },
+  ],
+})
+
+const llmModels = ref<any[]>([])
+const visionModels = ref<any[]>([])
+const submit_handle = ref<(models: { llmModel: string; visionModel: string }) => void>()
+
+const loadModels = async () => {
+  try {
+    const response = await modelApi.getSelectModelList()
+    if (response.data) {
+      llmModels.value = response.data.filter((m: any) => m.model_type === 'LLM')
+      visionModels.value = response.data.filter(
+        (m: any) => m.model_type === 'IMAGE',
+      )
+    }
+  } catch (error) {
+    console.error('Failed to load models:', error)
+    MsgError(t('views.document.advancedLearning.loadModelsFailed'))
+  }
+}
+
+const submit = async () => {
+  if (!formRef.value) return
+
+  await formRef.value.validate((valid) => {
|
||||
if (valid) {
|
||||
if (submit_handle.value && form.llmModel && form.visionModel) {
|
||||
loading.value = true
|
||||
submit_handle.value({
|
||||
llmModel: form.llmModel,
|
||||
visionModel: form.visionModel,
|
||||
})
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
const open = (handle: (models: { llmModel: string; visionModel: string }) => void) => {
|
||||
submit_handle.value = handle
|
||||
loadModels()
|
||||
dialogVisible.value = true
|
||||
}
|
||||
|
||||
const close = () => {
|
||||
loading.value = false
|
||||
form.llmModel = null
|
||||
form.visionModel = null
|
||||
formRef.value?.resetFields()
|
||||
submit_handle.value = undefined
|
||||
dialogVisible.value = false
|
||||
}
|
||||
|
||||
defineExpose({ open, close })
|
||||
</script>
|
||||
|
||||
<style lang="scss" scoped>
|
||||
.update-info {
|
||||
background: #fef0e6;
|
||||
color: #8a6d3b;
|
||||
font-size: 14px;
|
||||
}
|
||||
|
||||
.form-container {
|
||||
width: 100%;
|
||||
|
||||
:deep(.el-form-item) {
|
||||
margin-bottom: 22px;
|
||||
}
|
||||
|
||||
:deep(.el-form-item__content) {
|
||||
width: 100%;
|
||||
}
|
||||
|
||||
:deep(.el-select) {
|
||||
width: 100% !important;
|
||||
}
|
||||
}
|
||||
|
||||
.w-full {
|
||||
width: 100% !important;
|
||||
}
|
||||
</style>
|
||||
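The component follows the callback-dialog pattern used elsewhere in this view: the parent hands a submit handler to open(), and loading stays true until the parent explicitly calls close(). A minimal parent-side sketch (the ref name is hypothetical):

const dlg = ref<InstanceType<typeof AdvancedLearningDialog>>()
dlg.value?.open(({ llmModel, visionModel }) => {
  // fire the batch request here, then dlg.value?.close() when it settles
})
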
@ -35,6 +35,10 @@
      <el-icon class="is-loading color-primary"><Loading /></el-icon>
      {{ stateMap[aggStatus.value](aggStatus.key) }}
    </el-text>
    <el-text v-else-if="aggStatus?.value === State.PARSING">
      <el-icon class="is-loading color-primary"><Loading /></el-icon>
      {{ stateMap[aggStatus.value](aggStatus.key) }}
    </el-text>
  </template>
</el-popover>
</template>

@ -49,6 +53,7 @@ const checkList: Array<string> = [
  State.REVOKE,
  State.STARTED,
  State.PENDING,
  State.PARSING,
  State.FAILURE,
  State.REVOKED,
  State.SUCCESS

@ -82,6 +87,7 @@ const stateMap: any = {
  [State.REVOKED]: (type: number) => t('views.document.fileStatus.SUCCESS'),
  [State.FAILURE]: (type: number) => t('views.document.fileStatus.FAILURE'),
  [State.SUCCESS]: (type: number) => t('views.document.fileStatus.SUCCESS'),
  [State.PARSING]: (type: number) => t('views.document.fileStatus.PARSING'),
}
</script>
<style lang="scss" scoped></style>

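With these additions, a PARSING aggregate gets the same spinner treatment as STARTED/PENDING, and its label resolves through the same dispatch table. Illustrative lookup (the task-type value is assumed, not shown in this hunk):

// Every stateMap entry is a function of the task type; PARSING ignores it
// and always resolves the new '学习中' label added to the locale file.
const someTaskType = 1 // hypothetical TaskType value
const label = stateMap[State.PARSING](someTaskType)
// label === t('views.document.fileStatus.PARSING')
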
@ -44,6 +44,12 @@
  v-if="permissionPrecise.doc_vector(id)"
  >{{ $t('views.knowledge.setting.vectorization') }}
</el-button>
<el-button
  @click="openAdvancedLearningDialog"
  :disabled="multipleSelection.length === 0"
  v-if="permissionPrecise.doc_vector(id)"
  >{{ $t('views.document.advancedLearning.button') }}
</el-button>
<el-button
  @click="openGenerateDialog()"
  :disabled="multipleSelection.length === 0"

@ -239,6 +245,19 @@
  sortable
/>

<el-table-column
  prop="learning_type"
  :label="$t('views.document.table.learningType')"
  align="center"
  min-width="100"
>
  <template #default="{ row }">
    <el-tag :type="row.learning_type === 'advanced' ? 'danger' : 'success'" size="small">
      {{ $t(`views.document.learningType.${row.learning_type || 'regular'}`) }}
    </el-tag>
  </template>
</el-table-column>

<el-table-column width="130">
  <template #header>
    <div>

@ -635,6 +654,7 @@
</div>

<EmbeddingContentDialog ref="embeddingContentDialogRef"></EmbeddingContentDialog>
<AdvancedLearningDialog ref="advancedLearningDialogRef"></AdvancedLearningDialog>

<ImportDocumentDialog ref="ImportDocumentDialogRef" :title="title" @refresh="refresh" />
<SyncWebDialog ref="SyncWebDialogRef" @refresh="refresh" />

@ -662,6 +682,7 @@ import useStore from '@/stores'
import StatusValue from '@/views/document/component/Status.vue'
import GenerateRelatedDialog from '@/components/generate-related-dialog/index.vue'
import EmbeddingContentDialog from '@/views/document/component/EmbeddingContentDialog.vue'
import AdvancedLearningDialog from '@/views/document/component/AdvancedLearningDialog.vue'
import { TaskType, State } from '@/utils/status'
import { t } from '@/locales'
import permissionMap from '@/permission'

@ -746,6 +767,7 @@ const getTaskState = (status: string, taskType: number) => {
const beforePagination = computed(() => common.paginationConfig[storeKey])
const beforeSearch = computed(() => common.search[storeKey])
const embeddingContentDialogRef = ref<InstanceType<typeof EmbeddingContentDialog>>()
const advancedLearningDialogRef = ref<InstanceType<typeof AdvancedLearningDialog>>()
const SyncWebDialogRef = ref()
const loading = ref(false)
let interval: any

@ -1041,6 +1063,24 @@ function batchRefresh() {
  embeddingContentDialogRef.value?.open(embeddingBatchDocument)
}

function openAdvancedLearningDialog() {
  const arr: string[] = multipleSelection.value.map((v) => v.id)
  const advancedLearningDocument = (models: { llmModel: string; visionModel: string }) => {
    loadSharedApi({ type: 'document', systemType: apiType.value })
      .putBatchAdvancedLearning(id, arr, models, loading)
      .then(() => {
        MsgSuccess(t('views.document.advancedLearning.successMessage'))
        multipleTableRef.value?.clearSelection()
        advancedLearningDialogRef.value?.close()
        getList()
      })
      .catch(() => {
        advancedLearningDialogRef.value?.close()
      })
  }
  advancedLearningDialogRef.value?.open(advancedLearningDocument)
}

function downloadDocument(row: any) {
  loadSharedApi({ type: 'document', systemType: apiType.value })
    .getDownloadSourceFile(id, row.id, row.name)

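One note on the new learning_type column: the template falls back to 'regular' when the backend omits the field, so pre-existing documents still render a tag. The assumed row shape (not shown in this diff) is roughly:

// Hypothetical shape of the field the column reads:
interface DocumentRow {
  learning_type?: 'regular' | 'advanced'
}
// $t(`views.document.learningType.${row.learning_type || 'regular'}`)
// resolves to '普通学习' for undefined/regular and '高级学习' for advanced.
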
@ -11,11 +11,11 @@
<div class="mt-16 mb-16">
  <el-radio-group v-model="form.fileType" @change="radioChange" class="app-radio-button-group">
    <el-radio-button value="txt">{{ $t('views.document.fileType.txt.label') }}</el-radio-button>
    <el-radio-button value="mineru">高级学习</el-radio-button>
    <el-radio-button value="table">{{
      $t('views.document.fileType.table.label')
    }}</el-radio-button>
    <el-radio-button value="QA">{{ $t('views.document.fileType.QA.label') }}</el-radio-button>
    <el-radio-button value="mineru">MinerU</el-radio-button>
  </el-radio-group>
</div>

@ -140,7 +140,7 @@
  <AppIcon iconName="app-warning-colorful" style="font-size: 16px"></AppIcon>
</div>
<div class="ml-16 lighter">
  <p>1. MinerU 提供高质量的 PDF 和 PPT 文档解析,支持复杂表格、图片、公式等内容</p>
  <p>1. 高级学习提供高质量的 PDF 和 PPT 文档解析,支持复杂表格、图片、公式等内容</p>
  <p>2. 支持的文件格式:PDF、PPT、PPTX</p>
  <p>
    3. {{ $t('views.document.tip.fileLimitCountTip1') }} {{ file_count_limit }}

@ -361,7 +361,7 @@ const loadModels = async () => {
      // Split LLMs from vision models
      llmModels.value = response.data.filter((m: any) => m.model_type === 'LLM')
      visionModels.value = response.data.filter((m: any) =>
        m.model_type === 'IMAGE' || m.model_type === 'LLM' // LLM models may also support vision
        m.model_type === 'IMAGE' // only offer IMAGE-type vision models
      )
    }
  } catch (error) {

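The loadModels change here mirrors the new dialog: LLMs are no longer offered as vision candidates, narrowing the vision dropdown to dedicated IMAGE models. As a standalone sketch of the new filter:

// Before this commit: m.model_type === 'IMAGE' || m.model_type === 'LLM'
// After: only dedicated vision models qualify.
const visionCandidates = (models: { model_type: string }[]) =>
  models.filter((m) => m.model_type === 'IMAGE')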