参考高级学习处理流程修改音视频文档处理
- 修改批量插入段落逻辑,跳过音视频文档的段落插入 - 音视频文档段落由异步任务处理,不在此处插入 - 保持与高级学习文档相同的处理模式 - 添加详细的日志记录用于调试 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
parent
86ef54fb75
commit
459b0c8307
@ -1205,73 +1205,12 @@ class DocumentSerializers(serializers.Serializer):
|
|||||||
document['paragraphs'] = [] # 清空段落,等待异步处理
|
document['paragraphs'] = [] # 清空段落,等待异步处理
|
||||||
# 检查是否是音视频类型的文档
|
# 检查是否是音视频类型的文档
|
||||||
elif stt_model_id:
|
elif stt_model_id:
|
||||||
maxkb_logger.info(f"Document {document.get('name')} is media type, processing synchronously")
|
maxkb_logger.info(f"Document {document.get('name')} is media type, will process asynchronously")
|
||||||
# 音视频类型的文档,直接处理
|
# 音视频类型的文档,设置为异步处理
|
||||||
source_file_id = document.get('source_file_id')
|
# 清空段落,等待异步任务处理
|
||||||
if source_file_id:
|
document['paragraphs'] = []
|
||||||
try:
|
# 标记为异步音视频文档,用于后续异步处理
|
||||||
source_file = QuerySet(File).filter(id=source_file_id).first()
|
document['is_media_async'] = True
|
||||||
if source_file:
|
|
||||||
workspace_id_value = self.data.get('workspace_id', '')
|
|
||||||
maxkb_logger.info(f"Processing media file: {source_file.file_name}")
|
|
||||||
maxkb_logger.info(f" - STT model ID: {stt_model_id}")
|
|
||||||
maxkb_logger.info(f" - Workspace ID: {workspace_id_value}")
|
|
||||||
maxkb_logger.info(f" - LLM model ID: {llm_model_id}")
|
|
||||||
|
|
||||||
# 使用MediaSplitHandle处理音视频文件
|
|
||||||
from common.handle.impl.media.media_split_handle import MediaSplitHandle
|
|
||||||
media_handler = MediaSplitHandle()
|
|
||||||
|
|
||||||
# 准备文件对象
|
|
||||||
class FileWrapper:
|
|
||||||
def __init__(self, file_obj):
|
|
||||||
self.file_obj = file_obj
|
|
||||||
self.name = file_obj.file_name
|
|
||||||
self.size = file_obj.file_size
|
|
||||||
|
|
||||||
def read(self):
|
|
||||||
return self.file_obj.get_bytes()
|
|
||||||
|
|
||||||
def seek(self, pos):
|
|
||||||
pass
|
|
||||||
|
|
||||||
file_wrapper = FileWrapper(source_file)
|
|
||||||
|
|
||||||
# 处理音视频文件
|
|
||||||
result = media_handler.handle(
|
|
||||||
file_wrapper,
|
|
||||||
pattern_list=[],
|
|
||||||
with_filter=False,
|
|
||||||
limit=0,
|
|
||||||
get_buffer=lambda f: f.read(),
|
|
||||||
save_image=lambda x: None,
|
|
||||||
workspace_id=workspace_id_value,
|
|
||||||
stt_model_id=stt_model_id,
|
|
||||||
llm_model_id=llm_model_id
|
|
||||||
)
|
|
||||||
|
|
||||||
# 将处理结果添加到文档
|
|
||||||
if result and result.get('content'):
|
|
||||||
document['paragraphs'] = result.get('content', [])
|
|
||||||
maxkb_logger.info(f"Media file processed, got {len(document['paragraphs'])} paragraphs")
|
|
||||||
else:
|
|
||||||
maxkb_logger.warning(f"No content extracted from media file, using default")
|
|
||||||
document['paragraphs'] = [{
|
|
||||||
'content': f'[音视频文件: {source_file.file_name}]',
|
|
||||||
'title': '音视频内容'
|
|
||||||
}]
|
|
||||||
except Exception as e:
|
|
||||||
maxkb_logger.error(f"Failed to process media file: {str(e)}")
|
|
||||||
import traceback
|
|
||||||
maxkb_logger.error(traceback.format_exc())
|
|
||||||
# 如果处理失败,创建一个默认段落
|
|
||||||
document['paragraphs'] = [{
|
|
||||||
'content': f'[音视频文件处理失败: {str(e)}]',
|
|
||||||
'title': '处理失败'
|
|
||||||
}]
|
|
||||||
else:
|
|
||||||
maxkb_logger.warning(f"No source file for media document")
|
|
||||||
document['paragraphs'] = []
|
|
||||||
|
|
||||||
# 插入文档
|
# 插入文档
|
||||||
for document in instance_list:
|
for document in instance_list:
|
||||||
@ -1373,10 +1312,25 @@ class DocumentSerializers(serializers.Serializer):
|
|||||||
State.FAILURE
|
State.FAILURE
|
||||||
)
|
)
|
||||||
|
|
||||||
# 批量插入段落(只为非高级学习文档)
|
# 批量插入段落(只为非高级学习文档和非音视频文档)
|
||||||
if len(paragraph_model_list) > 0:
|
if len(paragraph_model_list) > 0:
|
||||||
maxkb_logger.info(f"Total paragraphs to insert: {len(paragraph_model_list)}")
|
maxkb_logger.info(f"Total paragraphs to insert: {len(paragraph_model_list)}")
|
||||||
|
|
||||||
|
# 获取音视频文档ID列表
|
||||||
|
media_document_ids = []
|
||||||
|
for idx, document in enumerate(instance_list):
|
||||||
|
stt_model_id = document.get('stt_model_id')
|
||||||
|
if stt_model_id and idx < len(document_model_list):
|
||||||
|
media_document_ids.append(str(document_model_list[idx].id))
|
||||||
|
|
||||||
|
maxkb_logger.info(f"Media document IDs to skip paragraph insertion: {media_document_ids}")
|
||||||
|
|
||||||
for document in document_model_list:
|
for document in document_model_list:
|
||||||
|
# 跳过高级学习文档和音视频文档的段落插入
|
||||||
|
if str(document.id) in media_document_ids:
|
||||||
|
maxkb_logger.info(f"Skipping paragraph insertion for media document: {document.id}")
|
||||||
|
continue
|
||||||
|
|
||||||
max_position = Paragraph.objects.filter(document_id=document.id).aggregate(
|
max_position = Paragraph.objects.filter(document_id=document.id).aggregate(
|
||||||
max_position=Max('position')
|
max_position=Max('position')
|
||||||
)['max_position'] or 0
|
)['max_position'] or 0
|
||||||
@ -1410,17 +1364,67 @@ class DocumentSerializers(serializers.Serializer):
|
|||||||
with_search_one=False
|
with_search_one=False
|
||||||
)
|
)
|
||||||
|
|
||||||
# 标记高级学习文档和音视频文档
|
# 标记高级学习文档和音视频文档,并触发异步任务
|
||||||
for idx, document in enumerate(instance_list):
|
for idx, document in enumerate(instance_list):
|
||||||
llm_model_id = document.get('llm_model_id')
|
llm_model_id = document.get('llm_model_id')
|
||||||
vision_model_id = document.get('vision_model_id')
|
vision_model_id = document.get('vision_model_id')
|
||||||
stt_model_id = document.get('stt_model_id')
|
stt_model_id = document.get('stt_model_id')
|
||||||
|
|
||||||
if idx < len(document_result_list):
|
if idx < len(document_result_list):
|
||||||
|
document_id = document_result_list[idx].get('id')
|
||||||
|
|
||||||
if llm_model_id and vision_model_id:
|
if llm_model_id and vision_model_id:
|
||||||
document_result_list[idx]['is_advanced_learning'] = True
|
document_result_list[idx]['is_advanced_learning'] = True
|
||||||
|
# 触发高级学习异步任务
|
||||||
|
try:
|
||||||
|
from knowledge.tasks.advanced_learning import batch_advanced_learning
|
||||||
|
batch_advanced_learning.delay(
|
||||||
|
[document_id],
|
||||||
|
str(knowledge_id),
|
||||||
|
workspace_id,
|
||||||
|
llm_model_id,
|
||||||
|
vision_model_id
|
||||||
|
)
|
||||||
|
maxkb_logger.info(f"Submitted advanced learning task for document: {document_id}")
|
||||||
|
except Exception as e:
|
||||||
|
maxkb_logger.error(f"Failed to submit advanced learning task: {str(e)}")
|
||||||
|
|
||||||
elif stt_model_id:
|
elif stt_model_id:
|
||||||
document_result_list[idx]['is_media_learning'] = True
|
document_result_list[idx]['is_media_learning'] = True
|
||||||
|
# 设置排队状态并触发音视频异步任务
|
||||||
|
try:
|
||||||
|
from common.event import ListenerManagement
|
||||||
|
from knowledge.models import TaskType, State
|
||||||
|
|
||||||
|
# 更新文档状态为排队中
|
||||||
|
ListenerManagement.update_status(
|
||||||
|
QuerySet(Document).filter(id=document_id),
|
||||||
|
TaskType.EMBEDDING,
|
||||||
|
State.PENDING
|
||||||
|
)
|
||||||
|
|
||||||
|
# 触发音视频异步处理任务
|
||||||
|
from knowledge.tasks.media_learning import media_learning_by_document
|
||||||
|
media_learning_by_document.delay(
|
||||||
|
document_id,
|
||||||
|
str(knowledge_id),
|
||||||
|
workspace_id,
|
||||||
|
stt_model_id,
|
||||||
|
llm_model_id
|
||||||
|
)
|
||||||
|
maxkb_logger.info(f"Submitted media learning task for document: {document_id}, status: PENDING")
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
maxkb_logger.error(f"Failed to submit media learning task: {str(e)}")
|
||||||
|
# 如果提交任务失败,更新状态为失败
|
||||||
|
try:
|
||||||
|
ListenerManagement.update_status(
|
||||||
|
QuerySet(Document).filter(id=document_id),
|
||||||
|
TaskType.EMBEDDING,
|
||||||
|
State.FAILURE
|
||||||
|
)
|
||||||
|
except Exception as status_error:
|
||||||
|
maxkb_logger.error(f"Failed to update status to FAILURE: {str(status_error)}")
|
||||||
|
|
||||||
return document_result_list, knowledge_id, workspace_id
|
return document_result_list, knowledge_id, workspace_id
|
||||||
|
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user