From 575b04c10faf123e66048f6554990cd121ba7f8c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9C=B1=E6=BD=AE?= Date: Tue, 26 Aug 2025 16:35:29 +0800 Subject: [PATCH] midyf model_id --- apps/common/handle/impl/mineru/api_client.py | 49 +++++++++++++++----- apps/knowledge/serializers/document.py | 36 +++++++++++--- apps/knowledge/tasks/advanced_learning.py | 8 ++-- 3 files changed, 70 insertions(+), 23 deletions(-) diff --git a/apps/common/handle/impl/mineru/api_client.py b/apps/common/handle/impl/mineru/api_client.py index 55fbbea6..c890942e 100644 --- a/apps/common/handle/impl/mineru/api_client.py +++ b/apps/common/handle/impl/mineru/api_client.py @@ -914,21 +914,46 @@ class MinerUAPIClient: # List files in images directory image_files = os.listdir(images_dir) self.logger.info(f"mineru-api: found {len(image_files)} files in images directory") + self.logger.info(f"mineru-api: image files in directory: {image_files[:10]}") # Show first 10 files - for img_filename in all_images: - src_img_path = os.path.join(images_dir, img_filename) - dest_img_path = os.path.join(temp_dir, img_filename) - - if os.path.exists(src_img_path): - import shutil + # Copy ALL image files from images directory to temp_dir + import shutil + for img_file in image_files: + if img_file.endswith(('.png', '.jpg', '.jpeg', '.gif', '.webp')): + src_img_path = os.path.join(images_dir, img_file) + dest_img_path = os.path.join(temp_dir, img_file) shutil.copy(src_img_path, dest_img_path) - self.logger.info(f"mineru-api: copied image {img_filename} to {dest_img_path}") - else: - self.logger.warning(f"mineru-api: image not found in images dir: {img_filename}") - # List available images for debugging - matching_files = [f for f in image_files if img_filename in f] + self.logger.info(f"mineru-api: copied image {img_file} to temp_dir") + + # Also try to copy specific images referenced in content_list + for img_filename in all_images: + # Try different possible paths and names + possible_names = [ + img_filename, + img_filename.replace('.png', '.jpg'), + img_filename.replace('.jpg', '.png'), + os.path.basename(img_filename) # Just the filename without path + ] + + copied = False + for name in possible_names: + src_img_path = os.path.join(images_dir, name) + if os.path.exists(src_img_path): + dest_img_path = os.path.join(temp_dir, img_filename) + if not os.path.exists(dest_img_path): + shutil.copy(src_img_path, dest_img_path) + self.logger.info(f"mineru-api: copied referenced image {name} as {img_filename}") + copied = True + break + + if not copied: + # Try to find similar files + base_name = os.path.splitext(img_filename)[0] + matching_files = [f for f in image_files if base_name in f] if matching_files: - self.logger.info(f"mineru-api: similar files found: {matching_files}") + self.logger.warning(f"mineru-api: image {img_filename} not found, but similar files exist: {matching_files}") + else: + self.logger.warning(f"mineru-api: image {img_filename} not found in images dir") else: self.logger.warning(f"mineru-api: images directory not found: {images_dir}") diff --git a/apps/knowledge/serializers/document.py b/apps/knowledge/serializers/document.py index b136b87e..89c35043 100644 --- a/apps/knowledge/serializers/document.py +++ b/apps/knowledge/serializers/document.py @@ -1196,7 +1196,19 @@ class DocumentSerializers(serializers.Serializer): ProblemParagraphManage(problem_paragraph_object_list, knowledge_id).to_problem_model_list() ) # 插入文档 - QuerySet(Document).bulk_create(document_model_list) if len(document_model_list) > 0 else None + if len(document_model_list) > 0: + QuerySet(Document).bulk_create(document_model_list) + # 确保文档已经保存到数据库 + from django.db import connection + connection.cursor().execute("SELECT 1") # 确保之前的操作已提交 + + # 验证文档是否成功保存 + for doc in document_model_list: + saved_doc = QuerySet(Document).filter(id=doc.id).first() + if saved_doc: + maxkb_logger.info(f"Document {doc.id} successfully saved to database") + else: + maxkb_logger.error(f"Document {doc.id} not found after bulk_create") # 处理高级学习文档的异步任务 for idx, document in enumerate(instance_list): @@ -1214,15 +1226,25 @@ class DocumentSerializers(serializers.Serializer): State.PENDING ) + # 提交异步任务前验证文档存在 + verify_doc = QuerySet(Document).filter(id=document_model.id).first() + if not verify_doc: + maxkb_logger.error(f"Document {document_model.id} not found before submitting task") + continue + # 提交异步任务 try: from knowledge.tasks.advanced_learning import advanced_learning_by_document - advanced_learning_by_document.delay( - str(document_model.id), - str(knowledge_id), - self.data.get('workspace_id', ''), - llm_model_id, - vision_model_id + # 使用 apply_async 并添加延迟,确保事务提交后再执行 + advanced_learning_by_document.apply_async( + args=[ + str(document_model.id), + str(knowledge_id), + self.data.get('workspace_id', ''), + llm_model_id, + vision_model_id + ], + countdown=2 # 延迟2秒执行 ) maxkb_logger.info(f"Advanced learning task submitted for document {document_model.id}") except Exception as e: diff --git a/apps/knowledge/tasks/advanced_learning.py b/apps/knowledge/tasks/advanced_learning.py index 8c30dd39..ea22fb7b 100644 --- a/apps/knowledge/tasks/advanced_learning.py +++ b/apps/knowledge/tasks/advanced_learning.py @@ -44,13 +44,13 @@ def advanced_learning_by_document(document_id: str, knowledge_id: str, workspace maxkb_logger.info(f"Updated document {document_id} status to PARSING") # 获取文档 - document = QuerySet(Document).filter(id=document_id).first() + document = Document.objects.filter(id=document_id).first() if not document: - maxkb_logger.error(f"Document {document_id} not found") + maxkb_logger.error(f"Document {document_id} not found in database") return # 获取知识库 - knowledge = QuerySet(Knowledge).filter(id=knowledge_id).first() + knowledge = Knowledge.objects.filter(id=knowledge_id).first() if not knowledge: maxkb_logger.error(f"Knowledge {knowledge_id} not found") return @@ -66,7 +66,7 @@ def advanced_learning_by_document(document_id: str, knowledge_id: str, workspace ) return - source_file = QuerySet(File).filter(id=source_file_id).first() + source_file = File.objects.filter(id=source_file_id).first() if not source_file: maxkb_logger.warning(f"Source file not found for document {document.id}") ListenerManagement.update_status(