maxkb/apps/knowledge/tasks/media_learning.py
朱潮 5da36659c2 修复音视频处理的关键问题
1. 修复Paragraph模型构造错误:
   - 将meta参数改为status_meta
   - 添加必需的knowledge_id参数

2. 修复使用demo数据的问题:
   - 移除所有demo数据生成代码
   - 改为调用实际的音频处理逻辑
   - 通过MediaSplitHandle进行实际处理

3. 增强MediaSplitHandle功能:
   - 支持实际处理和默认文本两种模式
   - 根据use_actual_processing参数选择处理方式
   - 保持向后兼容性

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-08-31 01:44:54 +08:00

215 lines
8.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# -*- coding: utf-8 -*-
"""
音视频学习任务处理 - 完全异步化状态流转
"""
import traceback
from typing import List, Optional
from celery import shared_task
from django.db import transaction
from django.db.models import QuerySet
from common.event import ListenerManagement
from knowledge.tasks.embedding import embedding_by_data_source
from common.utils.logger import maxkb_logger
from knowledge.models import Document, Paragraph, TaskType, State, File, FileSourceType
from common.handle.impl.media.media_split_handle import MediaSplitHandle
@shared_task(name='media_learning_by_document')
def media_learning_by_document(document_id: str, knowledge_id: str, workspace_id: str,
                               stt_model_id: str, llm_model_id: Optional[str] = None):
    """
    Async processing task for an audio/video document, with full status transitions.

    Status flow:
        1. PENDING  - task submitted, waiting to be processed
        2. STARTED  - transcribing the media content
        3. STARTED (+ paragraph creation) - building paragraphs and the index
        4. SUCCESS  - processing finished
        5. FAILURE  - processing failed

    Args:
        document_id: Document ID.
        knowledge_id: Knowledge base ID.
        workspace_id: Workspace ID.
        stt_model_id: Speech-to-text model ID.
        llm_model_id: Optional LLM model ID.

    Raises:
        ValueError: if the document or its source file cannot be found.
        Exception: any processing error is re-raised after the document is
            marked FAILURE, so Celery records the task as failed.
    """
    maxkb_logger.info(f"🎬 Starting media learning task for document: {document_id}")
    maxkb_logger.info(f"📋 Current status: PENDING (排队中)")
    try:
        # Validate that the document exists.
        document = QuerySet(Document).filter(id=document_id).first()
        if not document:
            raise ValueError(f"Document not found: {document_id}")
        # Validate the source media file referenced by the document meta.
        source_file_id = document.meta.get('source_file_id')
        if not source_file_id:
            raise ValueError(f"Source file not found for document: {document_id}")
        source_file = QuerySet(File).filter(id=source_file_id).first()
        if not source_file:
            raise ValueError(f"Source file not found: {source_file_id}")
        maxkb_logger.info(f"🎵 Processing media file: {source_file.file_name}")
        # Step 1: mark the document STARTED (transcription in progress).
        maxkb_logger.info(f"🔄 Updating status to: STARTED (生成中)")
        ListenerManagement.update_status(
            QuerySet(Document).filter(id=document_id),
            TaskType.EMBEDDING,
            State.STARTED
        )
        maxkb_logger.info(f"📝 Processing media file: {source_file.file_name}")
        # Run the actual transcription through MediaSplitHandle (imported at module top).
        try:
            from django.core.files.base import ContentFile
            handler = MediaSplitHandle()
            # Wrap the stored bytes in an in-memory file object for the handler.
            temp_file = ContentFile(source_file.get_bytes(), name=source_file.file_name)

            def get_buffer(file_obj):
                # Callback the handler uses to pull raw bytes from the file object.
                return file_obj.read()

            result = handler.handle(
                file=temp_file,
                pattern_list=[],
                with_filter=False,
                limit=0,  # no limit on paragraph count
                get_buffer=get_buffer,
                save_image=False,
                stt_model_id=stt_model_id,
                llm_model_id=llm_model_id,
                workspace_id=workspace_id,
                use_actual_processing=True  # request real transcription, not default text
            )
            # Normalize handler output into plain paragraph dicts.
            paragraphs_data = []
            for paragraph in result.get('content', []):
                paragraphs_data.append({
                    'content': paragraph['content'],
                    'title': paragraph['title'],
                    'metadata': paragraph.get('metadata', {})
                })
            maxkb_logger.info(f"✅ Successfully processed media file, generated {len(paragraphs_data)} paragraphs")
        except Exception as processing_error:
            maxkb_logger.error(f"❌ Failed to process media file: {str(processing_error)}")
            # Fall back to a single error paragraph so the document still completes
            # with a visible explanation instead of staying empty.
            # FIX: this dict literal was previously left unclosed (missing '}'),
            # which was a SyntaxError that broke the whole module.
            paragraphs_data = [{
                'content': f'音视频文件 "{source_file.file_name}" 处理失败: {str(processing_error)}',
                'title': '处理失败',
                'metadata': {
                    'error': str(processing_error),
                    'file_name': source_file.file_name
                }
            }]
        maxkb_logger.info(f"📝 Generated {len(paragraphs_data)} paragraphs for media file")
        # Step 2: paragraph creation / indexing. Status stays STARTED; the phase
        # is distinguished via logs only.
        maxkb_logger.info(f"📚 Updating status to: STARTED (索引中)")
        with transaction.atomic():
            paragraph_models = []
            for idx, para_data in enumerate(paragraphs_data):
                paragraph = Paragraph(
                    document_id=document_id,
                    knowledge_id=knowledge_id,
                    content=para_data.get('content', ''),
                    title=para_data.get('title', f'段落 {idx + 1}'),
                    position=idx + 1,
                    status_meta=para_data.get('metadata', {})
                )
                paragraph_models.append(paragraph)
            # Persist all paragraphs in a single query.
            if paragraph_models:
                QuerySet(Paragraph).bulk_create(paragraph_models)
                maxkb_logger.info(f"✅ Created {len(paragraph_models)} paragraphs for document {document_id}")
            # Keep the document's character count in sync with its paragraphs.
            total_char_length = sum(len(p.content) for p in paragraph_models)
            document.char_length = total_char_length
            document.save()
        # Step 3: trigger vectorization for the new paragraphs.
        maxkb_logger.info(f"🔍 Starting embedding for document: {document_id}")
        embedding_by_data_source(document_id, knowledge_id, workspace_id)
        # Step 4: mark the document SUCCESS.
        maxkb_logger.info(f"✅ Updating status to: SUCCESS (完成)")
        ListenerManagement.update_status(
            QuerySet(Document).filter(id=document_id),
            TaskType.EMBEDDING,
            State.SUCCESS
        )
        maxkb_logger.info(f"🎉 Media learning completed successfully for document: {document_id}")
        maxkb_logger.info(f"📊 Final stats: {len(paragraph_models)} paragraphs, {total_char_length} characters")
    except Exception as e:
        maxkb_logger.error(f"❌ Media learning failed for document {document_id}: {str(e)}")
        maxkb_logger.error(traceback.format_exc())
        # Mark the document FAILURE before re-raising so the UI reflects the error.
        maxkb_logger.info(f"💥 Updating status to: FAILURE (失败)")
        ListenerManagement.update_status(
            QuerySet(Document).filter(id=document_id),
            TaskType.EMBEDDING,
            State.FAILURE
        )
        raise
@shared_task(name='media_learning_batch')
def media_learning_batch(document_id_list: List[str], knowledge_id: str, workspace_id: str,
                         stt_model_id: str, llm_model_id: Optional[str] = None):
    """
    Batch media-learning task: fan out one async task per document.

    Each document gets its own `media_learning_by_document` Celery task; a
    submission failure for one document marks only that document FAILURE and
    does not stop the remaining submissions.

    Args:
        document_id_list: IDs of the documents to process.
        knowledge_id: Knowledge base ID.
        workspace_id: Workspace ID.
        stt_model_id: Speech-to-text model ID.
        llm_model_id: Optional LLM model ID.
    """
    maxkb_logger.info(f"🎬 Starting batch media learning for {len(document_id_list)} documents")

    def _mark_failed(document_id):
        # Best-effort status update; never let it abort the remaining submissions.
        try:
            ListenerManagement.update_status(
                QuerySet(Document).filter(id=document_id),
                TaskType.EMBEDDING,
                State.FAILURE
            )
        except Exception as status_error:
            maxkb_logger.error(f"Failed to update status for document {document_id}: {str(status_error)}")

    for document_id in document_id_list:
        try:
            media_learning_by_document.delay(
                document_id, knowledge_id, workspace_id, stt_model_id, llm_model_id
            )
        except Exception as e:
            maxkb_logger.error(f"Failed to submit task for document {document_id}: {str(e)}")
            _mark_failed(document_id)
        else:
            maxkb_logger.info(f"📋 Submitted media learning task for document: {document_id}")
    maxkb_logger.info(f"✅ Batch media learning tasks submitted")