音视频支持分段
Some checks are pending
sync2gitee / repo-sync (push) Waiting to run
Typos Check / Spell Check with Typos (push) Waiting to run

This commit is contained in:
朱潮 2025-12-18 23:12:18 +08:00
parent 3b143448a6
commit 8b85ad33f0
3 changed files with 185 additions and 20 deletions

View File

@ -29,19 +29,33 @@ class MediaSplitHandle(BaseSplitHandle):
def handle(self, file, pattern_list: List, with_filter: bool, limit: int,
           get_buffer, save_image, **kwargs):
    """Process an audio/video file and return its transcription split into chunks.

    Args:
        file: uploaded file object (must expose ``.name``).
        pattern_list: split patterns forwarded by the caller (logged here; the
            actual chunking is character-based via ``_apply_smart_split``).
        with_filter: when True, chunk text is cleaned via ``_clean_text``.
        limit: target chunk size in characters; ``0`` means "use default".
        get_buffer: callable returning the file's raw bytes.
        save_image: unused for media files (kept for interface compatibility).
        **kwargs: ``use_actual_processing`` (bool) and ``stt_model_id`` select
            real STT transcription vs. placeholder default text.

    Returns:
        dict with ``content`` (list of paragraph dicts) and ``metadata``.
    """
    maxkb_logger.info(f"MediaSplitHandle.handle called with file: {file.name}")
    maxkb_logger.info(f"Split parameters - limit: {limit}, patterns: {pattern_list}, with_filter: {with_filter}")
    # Decide whether to run real STT processing or emit default text.
    use_actual_processing = kwargs.get('use_actual_processing', False)
    stt_model_id = kwargs.get('stt_model_id')
    if use_actual_processing and stt_model_id:
        # Real transcription path.
        result = self._handle_actual_processing(file, get_buffer, **kwargs)
        # Apply smart chunking; limit <= 0 falls back to 1000 characters.
        chunk_limit = limit if limit > 0 else 1000
        if len(result.get('content', [])) > 0:
            result = self._apply_smart_split(result, chunk_limit, with_filter)
        return result
    else:
        # Placeholder/default text path.
        result = self._handle_default_text(file, **kwargs)
        # Even default text is chunked when an explicit limit was given.
        if limit > 0:
            result = self._apply_smart_split(result, limit, with_filter)
        return result
def _get_audio_default_segments(self, file_name: str) -> List[dict]:
"""生成音频文件的默认段落"""
@ -296,4 +310,123 @@ class MediaSplitHandle(BaseSplitHandle):
if hours > 0:
return f"{hours:02d}:{minutes:02d}:{secs:02d}"
else:
return f"{minutes:02d}:{secs:02d}"
return f"{minutes:02d}:{secs:02d}"
def smart_split_transcription(self, text, limit=1000, overlap=100):
    """Split transcription text into chunks of at most ``limit`` characters.

    Prefers breaking after a CJK full stop ('。') in the latter half of the
    window, then after a newline; consecutive chunks share up to ``overlap``
    characters of context.

    Args:
        text: transcription text to split.
        limit: maximum characters per chunk.
        overlap: characters of the previous chunk repeated at the start of
            the next one (must be < limit for useful progress).

    Returns:
        List of non-empty, stripped text chunks covering ``text`` in order.
    """
    if len(text) <= limit:
        return [text]
    # Preprocess: drop blank lines.
    text = '\n'.join(line for line in text.split('\n') if line.strip())
    chunks = []
    start = 0
    while start < len(text):
        end = min(start + limit, len(text))
        if end < len(text):
            best_end = end
            # Prefer breaking just after a sentence-ending full stop,
            # but only if that keeps the chunk at least half of `limit`.
            last_period = text.rfind('。', start, end)
            if last_period > start + limit // 2:
                best_end = last_period + 1
            else:
                # Fall back to a newline boundary in the last two thirds.
                last_newline = text.rfind('\n', start, end)
                if last_newline > start + limit // 3:
                    best_end = last_newline + 1
            end = best_end
        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)
        # Reached the end of the text: stop BEFORE stepping back by
        # `overlap`, otherwise the tail would be re-emitted forever.
        if end >= len(text):
            break
        # Step back by `overlap` for context; `start + 1` guarantees
        # forward progress even for degenerate overlap/limit values.
        start = max(end - overlap, start + 1)
    return chunks
def _apply_smart_split(self, result: dict, limit: int = 1000, with_filter: bool = False):
    """Re-chunk each transcription paragraph in ``result`` via smart splitting.

    Paragraphs whose text is empty (after optional cleaning) or that fit in a
    single unchanged chunk are kept as-is; otherwise each chunk becomes a new
    paragraph carrying the original metadata plus split bookkeeping fields.

    Args:
        result: dict with a ``content`` list of paragraph dicts.
        limit: target chunk size in characters.
        with_filter: when True, run ``_clean_text`` on paragraph text first.

    Returns:
        The same ``result`` dict, with ``content`` and ``metadata`` updated.
    """
    overlap = 100  # characters shared between consecutive chunks
    rebuilt = []
    for para in result.get('content', []):
        text = para.get('content', '')
        # Optional text cleanup before splitting.
        if with_filter:
            text = self._clean_text(text)
        if not text:
            rebuilt.append(para)
            continue
        pieces = self.smart_split_transcription(text, limit, overlap)
        # Single unchanged chunk: keep the original paragraph untouched.
        if len(pieces) == 1 and pieces[0] == text:
            rebuilt.append(para)
            continue
        base_title = para.get('title', '段落')
        total = len(pieces)
        for idx, piece in enumerate(pieces):
            # Copy original metadata, then record how this chunk was produced.
            meta = para.get('metadata', {}).copy()
            meta.update({
                'chunk_index': idx,
                'total_chunks': total,
                'split_method': 'smart_transcription',
                'split_limit': limit,
                'split_overlap': overlap,
                'with_filter': with_filter
            })
            rebuilt.append({
                'content': piece,
                'title': f"{base_title} - 第{idx + 1}部分" if total > 1 else base_title,
                'metadata': meta
            })
    result['content'] = rebuilt
    # Top-level bookkeeping for the whole split pass.
    top_meta = result.get('metadata', {})
    top_meta['smart_split_applied'] = True
    top_meta['total_chunks'] = len(rebuilt)
    result['metadata'] = top_meta
    maxkb_logger.info(f"Applied smart transcription split: {len(rebuilt)} chunks")
    return result
def _clean_text(self, text):
"""清理文本:去掉重复多余符号空格、空行、制表符"""
import re
# 移除多余的空白字符
text = re.sub(r'\s+', ' ', text)
# 移除开头和结尾的空白
text = text.strip()
# 移除重复的标点符号
text = re.sub(r'([。!?,])\1+', r'\1', text)
# 移除多余的换行
text = re.sub(r'\n{3,}', '\n\n', text)
return text

View File

@ -249,15 +249,33 @@ def submit_advanced_learning_task(document_id, knowledge_id, workspace_id, llm_m
def submit_media_learning_task(document_id, knowledge_id, workspace_id, stt_model_id, llm_model_id):
"""提交音视频学习任务的辅助函数"""
try:
# 获取文档实例以访问分块参数
from knowledge.models import Document
document = QuerySet(Document).filter(id=document_id).first()
# 获取分块参数
limit = None
patterns = None
with_filter = None
if document and document.meta:
limit = document.meta.get('split_limit')
patterns = document.meta.get('split_patterns', [])
with_filter = document.meta.get('split_with_filter')
from knowledge.tasks.media_learning import media_learning_by_document
media_learning_by_document.delay(
document_id,
knowledge_id,
workspace_id,
stt_model_id,
llm_model_id
llm_model_id,
limit=limit,
patterns=patterns,
with_filter=with_filter
)
maxkb_logger.info(f"Media learning task submitted for document: {document_id}")
maxkb_logger.info(f"Split parameters - limit: {limit}, patterns: {patterns}, with_filter: {with_filter}")
except Exception as e:
maxkb_logger.error(f"Failed to submit media learning task: {str(e)}")
ListenerManagement.update_status(

View File

@ -16,27 +16,34 @@ from common.handle.impl.media.media_split_handle import MediaSplitHandle
@shared_task(name='media_learning_by_document')
def media_learning_by_document(document_id: str, knowledge_id: str, workspace_id: str,
stt_model_id: str, llm_model_id: Optional[str] = None):
def media_learning_by_document(document_id: str, knowledge_id: str, workspace_id: str,
stt_model_id: str, llm_model_id: Optional[str] = None,
limit: Optional[int] = None,
patterns: Optional[List[str]] = None,
with_filter: Optional[bool] = None):
"""
音视频文档异步处理任务 - 完整状态流转
状态流程
1. 排队中 (PENDING) - 任务已提交等待处理
2. 生成中 (STARTED) - 正在转写音视频内容
3. 索引中 (STARTED + 段落创建) - 正在创建段落和索引
4. 完成 (SUCCESS) - 处理完成
5. 失败 (FAILURE) - 处理失败
Args:
document_id: 文档ID
knowledge_id: 知识库ID
knowledge_id: 知识库ID
workspace_id: 工作空间ID
stt_model_id: STT模型ID
llm_model_id: LLM模型ID可选
limit: 分段长度可选
patterns: 分段正则列表可选
with_filter: 是否清除特殊字符可选
"""
maxkb_logger.info(f"🎬 Starting media learning task for document: {document_id}")
maxkb_logger.info(f"📋 Current status: PENDING (排队中)")
maxkb_logger.info(f"📝 Split parameters - limit: {limit}, patterns: {patterns}, with_filter: {with_filter}")
try:
# 验证文档存在
@ -81,12 +88,12 @@ def media_learning_by_document(document_id: str, knowledge_id: str, workspace_id
def get_buffer(file_obj):
return file_obj.read()
# 处理音视频文件(禁用默认文本模式
# 处理音视频文件(传递分块参数
result = handler.handle(
file=temp_file,
pattern_list=[],
with_filter=False,
limit=0, # 不限制段落数量
pattern_list=patterns or [], # 使用传入的分段模式
with_filter=with_filter if with_filter is not None else False,
limit=limit if limit is not None else 0, # 0表示使用默认值在handle中会转为1000
get_buffer=get_buffer,
save_image=False,
stt_model_id=stt_model_id,
@ -180,24 +187,31 @@ def media_learning_by_document(document_id: str, knowledge_id: str, workspace_id
@shared_task(name='media_learning_batch')
def media_learning_batch(document_id_list: List[str], knowledge_id: str, workspace_id: str,
stt_model_id: str, llm_model_id: Optional[str] = None):
stt_model_id: str, llm_model_id: Optional[str] = None,
limit: Optional[int] = None,
patterns: Optional[List[str]] = None,
with_filter: Optional[bool] = None):
"""
批量音视频处理任务
Args:
document_id_list: 文档ID列表
knowledge_id: 知识库ID
workspace_id: 工作空间ID
stt_model_id: STT模型ID
llm_model_id: LLM模型ID可选
limit: 分段长度可选
patterns: 分段正则列表可选
with_filter: 是否清除特殊字符可选
"""
maxkb_logger.info(f"🎬 Starting batch media learning for {len(document_id_list)} documents")
# 为每个文档提交单独的处理任务
for document_id in document_id_list:
try:
media_learning_by_document.delay(
document_id, knowledge_id, workspace_id, stt_model_id, llm_model_id
document_id, knowledge_id, workspace_id, stt_model_id, llm_model_id,
limit, patterns, with_filter
)
maxkb_logger.info(f"📋 Submitted media learning task for document: {document_id}")
except Exception as e: