Support segmentation for audio and video
This commit is contained in:
parent 3b143448a6
commit 8b85ad33f0
@@ -31,6 +31,7 @@ class MediaSplitHandle(BaseSplitHandle):
         """Handle audio and video files."""
 
         maxkb_logger.info(f"MediaSplitHandle.handle called with file: {file.name}")
+        maxkb_logger.info(f"Split parameters - limit: {limit}, patterns: {pattern_list}, with_filter: {with_filter}")
 
         # Check whether actual processing is needed
         use_actual_processing = kwargs.get('use_actual_processing', False)
@@ -38,10 +39,23 @@ class MediaSplitHandle(BaseSplitHandle):
 
         if use_actual_processing and stt_model_id:
             # Run actual processing
-            return self._handle_actual_processing(file, get_buffer, **kwargs)
+            result = self._handle_actual_processing(file, get_buffer, **kwargs)
+
+            # Apply smart chunking (if needed)
+            chunk_limit = limit if limit > 0 else 1000  # default: 1000 characters
+            if len(result.get('content', [])) > 0:
+                result = self._apply_smart_split(result, chunk_limit, with_filter)
+
+            return result
         else:
             # Fall back to the default text
-            return self._handle_default_text(file, **kwargs)
+            result = self._handle_default_text(file, **kwargs)
+
+            # Apply chunking even to the default text (when a limit is given)
+            if limit > 0:
+                result = self._apply_smart_split(result, limit, with_filter)
+
+            return result
 
     def _get_audio_default_segments(self, file_name: str) -> List[dict]:
         """Generate the default segments for an audio file."""
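A minimal sketch of the limit fallback used in the hunk above (illustrative only; resolve_chunk_limit is a made-up helper name, not part of this change):

# Mirrors `chunk_limit = limit if limit > 0 else 1000` from the hunk above.
def resolve_chunk_limit(limit: int, default: int = 1000) -> int:
    # 0 (or any non-positive value) means "use the default chunk size"
    return limit if limit > 0 else default

assert resolve_chunk_limit(0) == 1000
assert resolve_chunk_limit(500) == 500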
@@ -297,3 +311,122 @@ class MediaSplitHandle(BaseSplitHandle):
             return f"{hours:02d}:{minutes:02d}:{secs:02d}"
         else:
             return f"{minutes:02d}:{secs:02d}"
+
+    def smart_split_transcription(self, text, limit=1000, overlap=100):
+        """Smart-split transcription text."""
+        if len(text) <= limit:
+            return [text]
+
+        # Pre-processing: drop excess empty lines
+        text = '\n'.join(line for line in text.split('\n') if line.strip())
+
+        chunks = []
+        start = 0
+
+        while start < len(text):
+            # Compute the end position of the current chunk
+            end = min(start + limit, len(text))
+
+            # If this is not the last chunk, try to break at a sentence end
+            if end < len(text):
+                # Look for the best break point
+                best_end = end
+
+                # Prefer breaking at a full stop
+                last_period = text.rfind('。', start, end)
+                if last_period > start + limit // 2:
+                    best_end = last_period + 1
+                else:
+                    # Otherwise break at a newline
+                    last_newline = text.rfind('\n', start, end)
+                    if last_newline > start + limit // 3:
+                        best_end = last_newline + 1
+
+                end = best_end
+
+            # Extract the current chunk
+            chunk = text[start:end].strip()
+            if chunk:
+                chunks.append(chunk)
+
+            # Stop once the end of the text has been reached
+            if end >= len(text):
+                break
+
+            # Overlap the next chunk, but always advance to avoid an infinite loop
+            start = max(end - overlap, start + 1)
+
+        return chunks
+
+    def _apply_smart_split(self, result: dict, limit: int = 1000, with_filter: bool = False):
+        """Apply smart chunking to a transcription result."""
+        overlap = 100  # number of characters shared between adjacent chunks
+
+        new_paragraphs = []
+        for paragraph in result.get('content', []):
+            content = paragraph.get('content', '')
+
+            # Apply text filtering (if requested)
+            if with_filter:
+                content = self._clean_text(content)
+
+            if content:
+                # Apply smart chunking
+                chunks = self.smart_split_transcription(content, limit, overlap)
+
+                # If there is a single, unchanged chunk, keep the paragraph as-is
+                if len(chunks) == 1 and chunks[0] == content:
+                    new_paragraphs.append(paragraph)
+                else:
+                    # Create new paragraphs
+                    for idx, chunk in enumerate(chunks):
+                        # Keep the original metadata, updating the split-related fields
+                        metadata = paragraph.get('metadata', {}).copy()
+                        metadata.update({
+                            'chunk_index': idx,
+                            'total_chunks': len(chunks),
+                            'split_method': 'smart_transcription',
+                            'split_limit': limit,
+                            'split_overlap': overlap,
+                            'with_filter': with_filter
+                        })
+
+                        new_paragraph = {
+                            'content': chunk,
+                            'title': f"{paragraph.get('title', 'Paragraph')} - part {idx + 1}" if len(chunks) > 1 else paragraph.get('title', 'Paragraph'),
+                            'metadata': metadata
+                        }
+                        new_paragraphs.append(new_paragraph)
+            else:
+                new_paragraphs.append(paragraph)
+
+        # Update the result
+        result['content'] = new_paragraphs
+
+        # Update the metadata
+        metadata = result.get('metadata', {})
+        metadata['smart_split_applied'] = True
+        metadata['total_chunks'] = len(new_paragraphs)
+        result['metadata'] = metadata
+
+        maxkb_logger.info(f"Applied smart transcription split: {len(new_paragraphs)} chunks")
+
+        return result
+
+    def _clean_text(self, text):
+        """Clean text: collapse duplicated punctuation, extra spaces, blank lines and tabs."""
+        import re
+
+        # Collapse runs of spaces and tabs (newlines are preserved so the blank-line rule below still applies)
+        text = re.sub(r'[ \t]+', ' ', text)
+
+        # Strip leading and trailing whitespace
+        text = text.strip()
+
+        # Collapse repeated punctuation marks
+        text = re.sub(r'([。!?,])\1+', r'\1', text)
+
+        # Collapse runs of three or more newlines
+        text = re.sub(r'\n{3,}', '\n\n', text)
+
+        return text
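A quick usage sketch for the splitter added above, assuming MediaSplitHandle can be instantiated without arguments (the sample transcript is made up):

# Build a ~1500-character transcript of repeated 5-character sentences.
handler = MediaSplitHandle()
transcript = "第一句话。" * 300

chunks = handler.smart_split_transcription(transcript, limit=1000, overlap=100)

# The first chunk ends at a sentence boundary; the next one starts 100
# characters earlier, so adjacent chunks share overlapping context.
for i, chunk in enumerate(chunks):
    print(i, len(chunk))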
@@ -249,15 +249,33 @@ def submit_advanced_learning_task(document_id, knowledge_id, workspace_id, llm_m
 def submit_media_learning_task(document_id, knowledge_id, workspace_id, stt_model_id, llm_model_id):
     """Helper for submitting audio/video learning tasks."""
     try:
+        # Fetch the document instance to access its chunking parameters
+        from knowledge.models import Document
+        document = QuerySet(Document).filter(id=document_id).first()
+
+        # Read the chunking parameters
+        limit = None
+        patterns = None
+        with_filter = None
+
+        if document and document.meta:
+            limit = document.meta.get('split_limit')
+            patterns = document.meta.get('split_patterns', [])
+            with_filter = document.meta.get('split_with_filter')
+
         from knowledge.tasks.media_learning import media_learning_by_document
         media_learning_by_document.delay(
             document_id,
             knowledge_id,
             workspace_id,
             stt_model_id,
-            llm_model_id
+            llm_model_id,
+            limit=limit,
+            patterns=patterns,
+            with_filter=with_filter
         )
         maxkb_logger.info(f"Media learning task submitted for document: {document_id}")
+        maxkb_logger.info(f"Split parameters - limit: {limit}, patterns: {patterns}, with_filter: {with_filter}")
     except Exception as e:
         maxkb_logger.error(f"Failed to submit media learning task: {str(e)}")
         ListenerManagement.update_status(
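For context, the meta keys read above imply the document's meta holds the split settings in roughly this shape (values hypothetical):

# Hypothetical document.meta contents matching the .get() calls above.
meta = {
    'split_limit': 800,            # becomes `limit`
    'split_patterns': ['\\n\\n'],  # becomes `patterns`
    'split_with_filter': True,     # becomes `with_filter`
}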
@@ -17,7 +17,10 @@ from common.handle.impl.media.media_split_handle import MediaSplitHandle
 
 @shared_task(name='media_learning_by_document')
 def media_learning_by_document(document_id: str, knowledge_id: str, workspace_id: str,
-                               stt_model_id: str, llm_model_id: Optional[str] = None):
+                               stt_model_id: str, llm_model_id: Optional[str] = None,
+                               limit: Optional[int] = None,
+                               patterns: Optional[List[str]] = None,
+                               with_filter: Optional[bool] = None):
     """
     Asynchronous processing task for audio/video documents - full status flow
 
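With the extended signature, a direct submission looks roughly like this (all IDs below are made up for illustration):

media_learning_by_document.delay(
    'doc-123', 'kb-456', 'ws-789',  # document, knowledge and workspace IDs
    stt_model_id='stt-1',
    llm_model_id=None,              # optional
    limit=800,                      # segment length
    patterns=['\\n\\n'],            # split regexes
    with_filter=True,               # strip special characters
)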
@@ -34,9 +37,13 @@ def media_learning_by_document(document_id: str, knowledge_id: str, workspace_id
         workspace_id: workspace ID
         stt_model_id: STT model ID
         llm_model_id: LLM model ID (optional)
+        limit: segment length (optional)
+        patterns: list of split regexes (optional)
+        with_filter: whether to strip special characters (optional)
     """
     maxkb_logger.info(f"🎬 Starting media learning task for document: {document_id}")
     maxkb_logger.info(f"📋 Current status: PENDING (queued)")
+    maxkb_logger.info(f"📝 Split parameters - limit: {limit}, patterns: {patterns}, with_filter: {with_filter}")
 
     try:
         # Verify the document exists
@@ -81,12 +88,12 @@ def media_learning_by_document(document_id: str, knowledge_id: str, workspace_id
         def get_buffer(file_obj):
             return file_obj.read()
 
-        # Process the audio/video file (default-text mode disabled)
+        # Process the audio/video file (passing the chunking parameters through)
         result = handler.handle(
             file=temp_file,
-            pattern_list=[],
-            with_filter=False,
-            limit=0,  # do not limit the number of segments
+            pattern_list=patterns or [],  # use the provided split patterns
+            with_filter=with_filter if with_filter is not None else False,
+            limit=limit if limit is not None else 0,  # 0 means "use the default" (handle() maps it to 1000)
             get_buffer=get_buffer,
             save_image=False,
             stt_model_id=stt_model_id,
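Illustrative only: the None checks above reduce to this normalization (a made-up helper, not part of the change):

def normalize_split_params(limit, patterns, with_filter):
    # None means "not set"; handle() later maps limit 0 to the 1000-char default.
    return (
        limit if limit is not None else 0,
        patterns or [],
        with_filter if with_filter is not None else False,
    )

assert normalize_split_params(None, None, None) == (0, [], False)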
@@ -180,7 +187,10 @@ def media_learning_by_document(document_id: str, knowledge_id: str, workspace_id
 
 @shared_task(name='media_learning_batch')
 def media_learning_batch(document_id_list: List[str], knowledge_id: str, workspace_id: str,
-                         stt_model_id: str, llm_model_id: Optional[str] = None):
+                         stt_model_id: str, llm_model_id: Optional[str] = None,
+                         limit: Optional[int] = None,
+                         patterns: Optional[List[str]] = None,
+                         with_filter: Optional[bool] = None):
     """
     Batch audio/video processing task
 
@@ -190,6 +200,9 @@ def media_learning_batch(document_id_list: List[str], knowledge_id: str, workspa
         workspace_id: workspace ID
         stt_model_id: STT model ID
         llm_model_id: LLM model ID (optional)
+        limit: segment length (optional)
+        patterns: list of split regexes (optional)
+        with_filter: whether to strip special characters (optional)
     """
     maxkb_logger.info(f"🎬 Starting batch media learning for {len(document_id_list)} documents")
 
@@ -197,7 +210,8 @@ def media_learning_batch(document_id_list: List[str], knowledge_id: str, workspa
     for document_id in document_id_list:
         try:
             media_learning_by_document.delay(
-                document_id, knowledge_id, workspace_id, stt_model_id, llm_model_id
+                document_id, knowledge_id, workspace_id, stt_model_id, llm_model_id,
+                limit, patterns, with_filter
             )
             maxkb_logger.info(f"📋 Submitted media learning task for document: {document_id}")
         except Exception as e:
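One note on the forwarding style above: the three new values are passed positionally, so their order must keep matching media_learning_by_document's signature. A keyword-based forward (sketch, same semantics) would be more robust against future signature changes:

media_learning_by_document.delay(
    document_id, knowledge_id, workspace_id, stt_model_id, llm_model_id,
    limit=limit, patterns=patterns, with_filter=with_filter,
)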