diff --git a/apps/knowledge/serializers/document.py b/apps/knowledge/serializers/document.py
index 1e9eeb05..5728b711 100644
--- a/apps/knowledge/serializers/document.py
+++ b/apps/knowledge/serializers/document.py
@@ -880,11 +880,6 @@ class DocumentSerializers(serializers.Serializer):
         @post(post_function=post_embedding)
         @transaction.atomic
         def save(self, instance: Dict, with_valid=False, **kwargs):
-            # Add debug logging
-            from common.utils.logger import maxkb_logger
-            maxkb_logger.info(f"Save called with instance keys: {list(instance.keys())}")
-            maxkb_logger.info(f"save method - limit: {instance.get('limit')}, split_patterns: {instance.get('patterns')}, with_filter: {instance.get('with_filter')}")
-
             # Make sure the chunking parameters use the correct key names
             # The frontend passes limit, patterns, with_filter
             # We need to convert them to split_limit, split_patterns, split_with_filter
@@ -1256,21 +1251,36 @@ class DocumentSerializers(serializers.Serializer):
             document_model_list = []
             paragraph_model_list = []
             problem_paragraph_object_list = []
-            
+
             # Handle MinerU-type and audio/video-type documents
             from common.utils.logger import maxkb_logger
             import os
-            
+
             # Add detailed logging
             maxkb_logger.info(f"batch_save called with workspace_id: {workspace_id}, knowledge_id: {knowledge_id}")
             maxkb_logger.info(f"instance_list contains {len(instance_list)} documents")
+
+            # Before processing the documents, make sure all chunking parameters are converted correctly
             for idx, doc in enumerate(instance_list):
+                # Make sure the chunking parameters use the correct key names
+                # The frontend passes limit, patterns, with_filter
+                # We need to convert them to split_limit, split_patterns, split_with_filter
+                if doc.get('limit') is not None:
+                    doc['split_limit'] = doc.get('limit')
+                    maxkb_logger.info(f"Document {idx}: Converting limit={doc.get('limit')} to split_limit")
+                if doc.get('patterns') is not None:
+                    doc['split_patterns'] = doc.get('patterns')
+                    maxkb_logger.info(f"Document {idx}: Converting patterns={doc.get('patterns')} to split_patterns")
+                if doc.get('with_filter') is not None:
+                    doc['split_with_filter'] = doc.get('with_filter')
+                    maxkb_logger.info(f"Document {idx}: Converting with_filter={doc.get('with_filter')} to split_with_filter")
+
                 maxkb_logger.info(f"Document {idx}: {doc.keys()}")
                 if 'stt_model_id' in doc:
                     maxkb_logger.info(f" - stt_model_id present: {doc['stt_model_id']}")
                 if 'llm_model_id' in doc:
                     maxkb_logger.info(f" - llm_model_id present: {doc['llm_model_id']}")
-            
+
             for document in instance_list:
                 # Check whether this is a MinerU-type document (requires both llm_model_id and vision_model_id)
                 llm_model_id = document.get('llm_model_id')
@@ -1296,16 +1306,6 @@ class DocumentSerializers(serializers.Serializer):
 
             # Insert documents
             for document in instance_list:
-                # Make sure the chunking parameters use the correct key names
-                # The frontend passes limit, patterns, with_filter
-                # We need to convert them to split_limit, split_patterns, split_with_filter
-                if document.get('limit') is not None:
-                    document['split_limit'] = document.get('limit')
-                if document.get('patterns') is not None:
-                    document['split_patterns'] = document.get('patterns')
-                if document.get('with_filter') is not None:
-                    document['split_with_filter'] = document.get('with_filter')
-
                 maxkb_logger.info(f"Processing document: {document.get('name')}, paragraphs count: {len(document.get('paragraphs', []))}")
                 maxkb_logger.info(f"Document split params - split_limit: {document.get('split_limit')}, split_patterns: {document.get('split_patterns')}, split_with_filter: {document.get('split_with_filter')}")
                 document_paragraph_dict_model = DocumentSerializers.Create.get_document_paragraph_model(
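
Note: the key-name conversion this patch moves to the top of batch_save (limit/patterns/with_filter -> split_limit/split_patterns/split_with_filter) could also be expressed as a small standalone helper. The sketch below is illustrative only; the normalize_split_params name and its use as a free function are assumptions and are not part of the patch.

from typing import Dict

# Hypothetical helper (not in the patch): normalize the frontend chunking keys
# to the internal split_* keys, in place, leaving other keys untouched.
def normalize_split_params(doc: Dict) -> Dict:
    key_map = {
        'limit': 'split_limit',
        'patterns': 'split_patterns',
        'with_filter': 'split_with_filter',
    }
    for src, dst in key_map.items():
        # Only copy values the frontend actually supplied
        if doc.get(src) is not None:
            doc[dst] = doc[src]
    return doc

# Usage sketch, assuming instance_list is the list of document dicts passed to batch_save:
# for doc in instance_list:
#     normalize_split_params(doc)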