From 575b04c10faf123e66048f6554990cd121ba7f8c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=9C=B1=E6=BD=AE?= <zhuchaowe@users.noreply.github.com>
Date: Tue, 26 Aug 2025 16:35:29 +0800
Subject: [PATCH] modify model_id

---
 apps/common/handle/impl/mineru/api_client.py | 49 +++++++++++++++-----
 apps/knowledge/serializers/document.py       | 36 +++++++++++---
 apps/knowledge/tasks/advanced_learning.py    |  8 ++--
 3 files changed, 70 insertions(+), 23 deletions(-)

diff --git a/apps/common/handle/impl/mineru/api_client.py b/apps/common/handle/impl/mineru/api_client.py
index 55fbbea6..c890942e 100644
--- a/apps/common/handle/impl/mineru/api_client.py
+++ b/apps/common/handle/impl/mineru/api_client.py
@@ -914,21 +914,46 @@ class MinerUAPIClient:
                 # List files in images directory
                 image_files = os.listdir(images_dir)
                 self.logger.info(f"mineru-api: found {len(image_files)} files in images directory")
+                self.logger.info(f"mineru-api: image files in directory: {image_files[:10]}")  # Show first 10 files
                 
-                for img_filename in all_images:
-                    src_img_path = os.path.join(images_dir, img_filename)
-                    dest_img_path = os.path.join(temp_dir, img_filename)
-                    
-                    if os.path.exists(src_img_path):
-                        import shutil
+                # Copy ALL image files from images directory to temp_dir
+                import shutil
+                for img_file in image_files:
+                    if img_file.endswith(('.png', '.jpg', '.jpeg', '.gif', '.webp')):
+                        src_img_path = os.path.join(images_dir, img_file)
+                        dest_img_path = os.path.join(temp_dir, img_file)
                         shutil.copy(src_img_path, dest_img_path)
-                        self.logger.info(f"mineru-api: copied image {img_filename} to {dest_img_path}")
-                    else:
-                        self.logger.warning(f"mineru-api: image not found in images dir: {img_filename}")
-                        # List available images for debugging
-                        matching_files = [f for f in image_files if img_filename in f]
+                        self.logger.info(f"mineru-api: copied image {img_file} to temp_dir")
+                
+                # Also try to copy specific images referenced in content_list
+                for img_filename in all_images:
+                    # Try different possible paths and names
+                    possible_names = [
+                        img_filename,
+                        img_filename.replace('.png', '.jpg'),
+                        img_filename.replace('.jpg', '.png'),
+                        os.path.basename(img_filename)  # Just the filename without path
+                    ]
+                    
+                    copied = False
+                    for name in possible_names:
+                        src_img_path = os.path.join(images_dir, name)
+                        if os.path.exists(src_img_path):
+                            dest_img_path = os.path.join(temp_dir, img_filename)
+                            if not os.path.exists(dest_img_path):
+                                shutil.copy(src_img_path, dest_img_path)
+                                self.logger.info(f"mineru-api: copied referenced image {name} as {img_filename}")
+                                copied = True
+                                break
+                    
+                    if not copied:
+                        # Try to find similar files
+                        base_name = os.path.splitext(img_filename)[0]
+                        matching_files = [f for f in image_files if base_name in f]
                         if matching_files:
-                            self.logger.info(f"mineru-api: similar files found: {matching_files}")
+                            self.logger.warning(f"mineru-api: image {img_filename} not found, but similar files exist: {matching_files}")
+                        else:
+                            self.logger.warning(f"mineru-api: image {img_filename} not found in images dir")
             else:
                 self.logger.warning(f"mineru-api: images directory not found: {images_dir}")
             
diff --git a/apps/knowledge/serializers/document.py b/apps/knowledge/serializers/document.py
index b136b87e..89c35043 100644
--- a/apps/knowledge/serializers/document.py
+++ b/apps/knowledge/serializers/document.py
@@ -1196,7 +1196,19 @@ class DocumentSerializers(serializers.Serializer):
                 ProblemParagraphManage(problem_paragraph_object_list, knowledge_id).to_problem_model_list()
             )
             # 插入文档
-            QuerySet(Document).bulk_create(document_model_list) if len(document_model_list) > 0 else None
+            if len(document_model_list) > 0:
+                QuerySet(Document).bulk_create(document_model_list)
+                # 确保文档已经保存到数据库
+                from django.db import connection
+                connection.cursor().execute("SELECT 1")  # 确保之前的操作已提交
+                
+                # 验证文档是否成功保存
+                for doc in document_model_list:
+                    saved_doc = QuerySet(Document).filter(id=doc.id).first()
+                    if saved_doc:
+                        maxkb_logger.info(f"Document {doc.id} successfully saved to database")
+                    else:
+                        maxkb_logger.error(f"Document {doc.id} not found after bulk_create")
             
             # 处理高级学习文档的异步任务
             for idx, document in enumerate(instance_list):
@@ -1214,15 +1226,25 @@ class DocumentSerializers(serializers.Serializer):
                         State.PENDING
                     )
                     
+                    # 提交异步任务前验证文档存在
+                    verify_doc = QuerySet(Document).filter(id=document_model.id).first()
+                    if not verify_doc:
+                        maxkb_logger.error(f"Document {document_model.id} not found before submitting task")
+                        continue
+                        
                     # 提交异步任务
                     try:
                         from knowledge.tasks.advanced_learning import advanced_learning_by_document
-                        advanced_learning_by_document.delay(
-                            str(document_model.id),
-                            str(knowledge_id),
-                            self.data.get('workspace_id', ''),
-                            llm_model_id,
-                            vision_model_id
+                        # 使用 apply_async 并添加延迟，确保事务提交后再执行
+                        advanced_learning_by_document.apply_async(
+                            args=[
+                                str(document_model.id),
+                                str(knowledge_id),
+                                self.data.get('workspace_id', ''),
+                                llm_model_id,
+                                vision_model_id
+                            ],
+                            countdown=2  # 延迟2秒执行
                         )
                         maxkb_logger.info(f"Advanced learning task submitted for document {document_model.id}")
                     except Exception as e:
diff --git a/apps/knowledge/tasks/advanced_learning.py b/apps/knowledge/tasks/advanced_learning.py
index 8c30dd39..ea22fb7b 100644
--- a/apps/knowledge/tasks/advanced_learning.py
+++ b/apps/knowledge/tasks/advanced_learning.py
@@ -44,13 +44,13 @@ def advanced_learning_by_document(document_id: str, knowledge_id: str, workspace
         maxkb_logger.info(f"Updated document {document_id} status to PARSING")
         
         # 获取文档
-        document = QuerySet(Document).filter(id=document_id).first()
+        document = Document.objects.filter(id=document_id).first()
         if not document:
-            maxkb_logger.error(f"Document {document_id} not found")
+            maxkb_logger.error(f"Document {document_id} not found in database")
             return
         
         # 获取知识库
-        knowledge = QuerySet(Knowledge).filter(id=knowledge_id).first()
+        knowledge = Knowledge.objects.filter(id=knowledge_id).first()
         if not knowledge:
             maxkb_logger.error(f"Knowledge {knowledge_id} not found")
             return
@@ -66,7 +66,7 @@ def advanced_learning_by_document(document_id: str, knowledge_id: str, workspace
             )
             return
         
-        source_file = QuerySet(File).filter(id=source_file_id).first()
+        source_file = File.objects.filter(id=source_file_id).first()
         if not source_file:
             maxkb_logger.warning(f"Source file not found for document {document.id}")
             ListenerManagement.update_status(