modify model_id
This commit is contained in:
parent
070b3e0057
commit
575b04c10f
@ -914,21 +914,46 @@ class MinerUAPIClient:
|
|||||||
# List files in images directory
|
# List files in images directory
|
||||||
image_files = os.listdir(images_dir)
|
image_files = os.listdir(images_dir)
|
||||||
self.logger.info(f"mineru-api: found {len(image_files)} files in images directory")
|
self.logger.info(f"mineru-api: found {len(image_files)} files in images directory")
|
||||||
|
self.logger.info(f"mineru-api: image files in directory: {image_files[:10]}") # Show first 10 files
|
||||||
|
|
||||||
for img_filename in all_images:
|
# Copy ALL image files from images directory to temp_dir
|
||||||
src_img_path = os.path.join(images_dir, img_filename)
|
|
||||||
dest_img_path = os.path.join(temp_dir, img_filename)
|
|
||||||
|
|
||||||
if os.path.exists(src_img_path):
|
|
||||||
import shutil
|
import shutil
|
||||||
|
for img_file in image_files:
|
||||||
|
if img_file.endswith(('.png', '.jpg', '.jpeg', '.gif', '.webp')):
|
||||||
|
src_img_path = os.path.join(images_dir, img_file)
|
||||||
|
dest_img_path = os.path.join(temp_dir, img_file)
|
||||||
shutil.copy(src_img_path, dest_img_path)
|
shutil.copy(src_img_path, dest_img_path)
|
||||||
self.logger.info(f"mineru-api: copied image {img_filename} to {dest_img_path}")
|
self.logger.info(f"mineru-api: copied image {img_file} to temp_dir")
|
||||||
else:
|
|
||||||
self.logger.warning(f"mineru-api: image not found in images dir: {img_filename}")
|
# Also try to copy specific images referenced in content_list
|
||||||
# List available images for debugging
|
for img_filename in all_images:
|
||||||
matching_files = [f for f in image_files if img_filename in f]
|
# Try different possible paths and names
|
||||||
|
possible_names = [
|
||||||
|
img_filename,
|
||||||
|
img_filename.replace('.png', '.jpg'),
|
||||||
|
img_filename.replace('.jpg', '.png'),
|
||||||
|
os.path.basename(img_filename) # Just the filename without path
|
||||||
|
]
|
||||||
|
|
||||||
|
copied = False
|
||||||
|
for name in possible_names:
|
||||||
|
src_img_path = os.path.join(images_dir, name)
|
||||||
|
if os.path.exists(src_img_path):
|
||||||
|
dest_img_path = os.path.join(temp_dir, img_filename)
|
||||||
|
if not os.path.exists(dest_img_path):
|
||||||
|
shutil.copy(src_img_path, dest_img_path)
|
||||||
|
self.logger.info(f"mineru-api: copied referenced image {name} as {img_filename}")
|
||||||
|
copied = True
|
||||||
|
break
|
||||||
|
|
||||||
|
if not copied:
|
||||||
|
# Try to find similar files
|
||||||
|
base_name = os.path.splitext(img_filename)[0]
|
||||||
|
matching_files = [f for f in image_files if base_name in f]
|
||||||
if matching_files:
|
if matching_files:
|
||||||
self.logger.info(f"mineru-api: similar files found: {matching_files}")
|
self.logger.warning(f"mineru-api: image {img_filename} not found, but similar files exist: {matching_files}")
|
||||||
|
else:
|
||||||
|
self.logger.warning(f"mineru-api: image {img_filename} not found in images dir")
|
||||||
else:
|
else:
|
||||||
self.logger.warning(f"mineru-api: images directory not found: {images_dir}")
|
self.logger.warning(f"mineru-api: images directory not found: {images_dir}")
|
||||||
|
|
||||||
|
|||||||
@ -1196,7 +1196,19 @@ class DocumentSerializers(serializers.Serializer):
|
|||||||
ProblemParagraphManage(problem_paragraph_object_list, knowledge_id).to_problem_model_list()
|
ProblemParagraphManage(problem_paragraph_object_list, knowledge_id).to_problem_model_list()
|
||||||
)
|
)
|
||||||
# 插入文档
|
# 插入文档
|
||||||
QuerySet(Document).bulk_create(document_model_list) if len(document_model_list) > 0 else None
|
if len(document_model_list) > 0:
|
||||||
|
QuerySet(Document).bulk_create(document_model_list)
|
||||||
|
# 确保文档已经保存到数据库
|
||||||
|
from django.db import connection
|
||||||
|
connection.cursor().execute("SELECT 1") # 确保之前的操作已提交
|
||||||
|
|
||||||
|
# 验证文档是否成功保存
|
||||||
|
for doc in document_model_list:
|
||||||
|
saved_doc = QuerySet(Document).filter(id=doc.id).first()
|
||||||
|
if saved_doc:
|
||||||
|
maxkb_logger.info(f"Document {doc.id} successfully saved to database")
|
||||||
|
else:
|
||||||
|
maxkb_logger.error(f"Document {doc.id} not found after bulk_create")
|
||||||
|
|
||||||
# 处理高级学习文档的异步任务
|
# 处理高级学习文档的异步任务
|
||||||
for idx, document in enumerate(instance_list):
|
for idx, document in enumerate(instance_list):
|
||||||
@ -1214,15 +1226,25 @@ class DocumentSerializers(serializers.Serializer):
|
|||||||
State.PENDING
|
State.PENDING
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# 提交异步任务前验证文档存在
|
||||||
|
verify_doc = QuerySet(Document).filter(id=document_model.id).first()
|
||||||
|
if not verify_doc:
|
||||||
|
maxkb_logger.error(f"Document {document_model.id} not found before submitting task")
|
||||||
|
continue
|
||||||
|
|
||||||
# 提交异步任务
|
# 提交异步任务
|
||||||
try:
|
try:
|
||||||
from knowledge.tasks.advanced_learning import advanced_learning_by_document
|
from knowledge.tasks.advanced_learning import advanced_learning_by_document
|
||||||
advanced_learning_by_document.delay(
|
# 使用 apply_async 并添加延迟,确保事务提交后再执行
|
||||||
|
advanced_learning_by_document.apply_async(
|
||||||
|
args=[
|
||||||
str(document_model.id),
|
str(document_model.id),
|
||||||
str(knowledge_id),
|
str(knowledge_id),
|
||||||
self.data.get('workspace_id', ''),
|
self.data.get('workspace_id', ''),
|
||||||
llm_model_id,
|
llm_model_id,
|
||||||
vision_model_id
|
vision_model_id
|
||||||
|
],
|
||||||
|
countdown=2 # 延迟2秒执行
|
||||||
)
|
)
|
||||||
maxkb_logger.info(f"Advanced learning task submitted for document {document_model.id}")
|
maxkb_logger.info(f"Advanced learning task submitted for document {document_model.id}")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|||||||
@ -44,13 +44,13 @@ def advanced_learning_by_document(document_id: str, knowledge_id: str, workspace
|
|||||||
maxkb_logger.info(f"Updated document {document_id} status to PARSING")
|
maxkb_logger.info(f"Updated document {document_id} status to PARSING")
|
||||||
|
|
||||||
# 获取文档
|
# 获取文档
|
||||||
document = QuerySet(Document).filter(id=document_id).first()
|
document = Document.objects.filter(id=document_id).first()
|
||||||
if not document:
|
if not document:
|
||||||
maxkb_logger.error(f"Document {document_id} not found")
|
maxkb_logger.error(f"Document {document_id} not found in database")
|
||||||
return
|
return
|
||||||
|
|
||||||
# 获取知识库
|
# 获取知识库
|
||||||
knowledge = QuerySet(Knowledge).filter(id=knowledge_id).first()
|
knowledge = Knowledge.objects.filter(id=knowledge_id).first()
|
||||||
if not knowledge:
|
if not knowledge:
|
||||||
maxkb_logger.error(f"Knowledge {knowledge_id} not found")
|
maxkb_logger.error(f"Knowledge {knowledge_id} not found")
|
||||||
return
|
return
|
||||||
@ -66,7 +66,7 @@ def advanced_learning_by_document(document_id: str, knowledge_id: str, workspace
|
|||||||
)
|
)
|
||||||
return
|
return
|
||||||
|
|
||||||
source_file = QuerySet(File).filter(id=source_file_id).first()
|
source_file = File.objects.filter(id=source_file_id).first()
|
||||||
if not source_file:
|
if not source_file:
|
||||||
maxkb_logger.warning(f"Source file not found for document {document.id}")
|
maxkb_logger.warning(f"Source file not found for document {document.id}")
|
||||||
ListenerManagement.update_status(
|
ListenerManagement.update_status(
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user