Support segmentation for audio/video files
This commit is contained in:
parent 3b143448a6
commit 8b85ad33f0
@@ -31,6 +31,7 @@ class MediaSplitHandle(BaseSplitHandle):
         """Handle audio/video files."""
         maxkb_logger.info(f"MediaSplitHandle.handle called with file: {file.name}")
+        maxkb_logger.info(f"Split parameters - limit: {limit}, patterns: {pattern_list}, with_filter: {with_filter}")
 
         # Check whether actual processing is required
         use_actual_processing = kwargs.get('use_actual_processing', False)
@@ -38,10 +39,23 @@ class MediaSplitHandle(BaseSplitHandle):
 
         if use_actual_processing and stt_model_id:
             # Run the actual processing
-            return self._handle_actual_processing(file, get_buffer, **kwargs)
+            result = self._handle_actual_processing(file, get_buffer, **kwargs)
+
+            # Apply smart chunking if needed
+            chunk_limit = limit if limit > 0 else 1000  # default: 1000 characters
+            if len(result.get('content', [])) > 0:
+                result = self._apply_smart_split(result, chunk_limit, with_filter)
+
+            return result
         else:
             # Fall back to the default text
-            return self._handle_default_text(file, **kwargs)
+            result = self._handle_default_text(file, **kwargs)
+
+            # Apply chunking to the default text too (when a limit is set)
+            if limit > 0:
+                result = self._apply_smart_split(result, limit, with_filter)
+
+            return result
 
     def _get_audio_default_segments(self, file_name: str) -> List[dict]:
         """Generate default segments for an audio file."""
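
Both branches of `handle` now capture the handler result instead of returning early, so `_apply_smart_split` can post-process it. For orientation, a hedged sketch of the result dict this code threads through; the field names are inferred from the `result.get('content')` and `paragraph.get(...)` accesses below, and all values are hypothetical:

    # Hypothetical shape of the result dict consumed by _apply_smart_split
    result = {
        'content': [
            {
                'title': '00:00 - 01:30',               # segment title, e.g. a time range
                'content': 'transcribed text here...',  # STT output for the segment
                'metadata': {'source': 'stt'},          # per-paragraph metadata
            },
        ],
        'metadata': {'duration': 90},                   # document-level metadata
    }
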
@@ -297,3 +311,122 @@ class MediaSplitHandle(BaseSplitHandle):
             return f"{hours:02d}:{minutes:02d}:{secs:02d}"
         else:
             return f"{minutes:02d}:{secs:02d}"
+
+    def smart_split_transcription(self, text, limit=1000, overlap=100):
+        """Smart-split transcription text."""
+        if len(text) <= limit:
+            return [text]
+
+        # Pre-processing: drop blank lines
+        text = '\n'.join(line for line in text.split('\n') if line.strip())
+
+        chunks = []
+        start = 0
+
+        while start < len(text):
+            # Compute the end position of the current chunk
+            end = min(start + limit, len(text))
+
+            # If this is not the last chunk, try to break at a sentence end
+            if end < len(text):
+                # Look for the best break point
+                best_end = end
+
+                # Prefer breaking at a full stop
+                last_period = text.rfind('。', start, end)
+                if last_period > start + limit // 2:
+                    best_end = last_period + 1
+                else:
+                    # Otherwise break at a newline
+                    last_newline = text.rfind('\n', start, end)
+                    if last_newline > start + limit // 3:
+                        best_end = last_newline + 1
+
+                end = best_end
+
+            # Extract the current chunk
+            chunk = text[start:end].strip()
+            if chunk:
+                chunks.append(chunk)
+
+            # Stop at the end of the text (avoids an infinite loop here)
+            if end >= len(text):
+                break
+
+            # Start of the next chunk: step back by `overlap`, never backwards
+            start = max(end - overlap, start + 1)
+
+        return chunks
+
+    def _apply_smart_split(self, result: dict, limit: int = 1000, with_filter: bool = False):
+        """Apply smart chunking to a transcription result."""
+        overlap = 100  # number of characters shared between adjacent chunks
+
+        new_paragraphs = []
+        for paragraph in result.get('content', []):
+            content = paragraph.get('content', '')
+
+            # Apply text filtering if requested
+            if with_filter:
+                content = self._clean_text(content)
+
+            if content:
+                # Apply smart chunking
+                chunks = self.smart_split_transcription(content, limit, overlap)
+
+                # A single, unchanged chunk keeps the original paragraph
+                if len(chunks) == 1 and chunks[0] == content:
+                    new_paragraphs.append(paragraph)
+                else:
+                    # Build new paragraphs
+                    for idx, chunk in enumerate(chunks):
+                        # Keep the original metadata but update split-related fields
+                        metadata = paragraph.get('metadata', {}).copy()
+                        metadata.update({
+                            'chunk_index': idx,
+                            'total_chunks': len(chunks),
+                            'split_method': 'smart_transcription',
+                            'split_limit': limit,
+                            'split_overlap': overlap,
+                            'with_filter': with_filter
+                        })
+
+                        new_paragraph = {
+                            'content': chunk,
+                            'title': f"{paragraph.get('title', '段落')} - 第{idx + 1}部分" if len(chunks) > 1 else paragraph.get('title', '段落'),
+                            'metadata': metadata
+                        }
+                        new_paragraphs.append(new_paragraph)
+            else:
+                new_paragraphs.append(paragraph)
+
+        # Update the result
+        result['content'] = new_paragraphs
+
+        # Update the document-level metadata
+        metadata = result.get('metadata', {})
+        metadata['smart_split_applied'] = True
+        metadata['total_chunks'] = len(new_paragraphs)
+        result['metadata'] = metadata
+
+        maxkb_logger.info(f"Applied smart transcription split: {len(new_paragraphs)} chunks")
+
+        return result
+
+    def _clean_text(self, text):
+        """Clean text: drop duplicated punctuation, extra spaces, blank lines and tabs."""
+        import re
+
+        # Collapse runs of spaces and tabs (keep newlines for the blank-line pass below)
+        text = re.sub(r'[ \t]+', ' ', text)
+
+        # Trim leading and trailing whitespace
+        text = text.strip()
+
+        # Collapse repeated punctuation marks
+        text = re.sub(r'([。!?,])\1+', r'\1', text)
+
+        # Collapse runs of blank lines
+        text = re.sub(r'\n{3,}', '\n\n', text)
+
+        return text
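
To see the break-point preference and overlap in action, here is a minimal standalone sketch (not part of the commit) that mirrors `smart_split_transcription` outside the class, with the same end-of-text guard:

    def smart_split(text, limit=1000, overlap=100):
        """Standalone mirror of smart_split_transcription, for experimentation."""
        if len(text) <= limit:
            return [text]
        text = '\n'.join(line for line in text.split('\n') if line.strip())
        chunks, start = [], 0
        while start < len(text):
            end = min(start + limit, len(text))
            if end < len(text):
                best_end = end
                last_period = text.rfind('。', start, end)
                if last_period > start + limit // 2:
                    best_end = last_period + 1
                else:
                    last_newline = text.rfind('\n', start, end)
                    if last_newline > start + limit // 3:
                        best_end = last_newline + 1
                end = best_end
            chunk = text[start:end].strip()
            if chunk:
                chunks.append(chunk)
            if end >= len(text):
                break
            start = max(end - overlap, start + 1)  # overlap, but always advance
        return chunks

    sample = "第一句话。第二句话稍微长一点。第三句话继续补充内容。第四句话收尾。"
    for i, c in enumerate(smart_split(sample, limit=20, overlap=5)):
        print(i, c)
    # Chunks end at '。' where possible; adjacent chunks share about 5 characters.
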
@@ -249,15 +249,33 @@ def submit_advanced_learning_task(document_id, knowledge_id, workspace_id, llm_m
 def submit_media_learning_task(document_id, knowledge_id, workspace_id, stt_model_id, llm_model_id):
     """Helper for submitting an audio/video learning task."""
     try:
+        # Fetch the document instance to access its chunking parameters
+        from knowledge.models import Document
+        document = QuerySet(Document).filter(id=document_id).first()
+
+        # Read the chunking parameters
+        limit = None
+        patterns = None
+        with_filter = None
+
+        if document and document.meta:
+            limit = document.meta.get('split_limit')
+            patterns = document.meta.get('split_patterns', [])
+            with_filter = document.meta.get('split_with_filter')
+
         from knowledge.tasks.media_learning import media_learning_by_document
         media_learning_by_document.delay(
             document_id,
             knowledge_id,
             workspace_id,
             stt_model_id,
-            llm_model_id
+            llm_model_id,
+            limit=limit,
+            patterns=patterns,
+            with_filter=with_filter
         )
         maxkb_logger.info(f"Media learning task submitted for document: {document_id}")
+        maxkb_logger.info(f"Split parameters - limit: {limit}, patterns: {patterns}, with_filter: {with_filter}")
     except Exception as e:
         maxkb_logger.error(f"Failed to submit media learning task: {str(e)}")
         ListenerManagement.update_status(
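
The helper assumes the chunking options were stashed on the document's `meta` field when the document was created. A hypothetical example of the `meta` dict it expects, with the key names taken from the `.get(...)` calls above and all values invented for illustration:

    # Hypothetical meta saved at document-creation time; these three keys are
    # exactly what submit_media_learning_task reads back out
    document.meta = {
        'split_limit': 800,                  # target characters per segment
        'split_patterns': [r'\n\n', r'。'],  # regex patterns for splitting
        'split_with_filter': True,           # clean punctuation/whitespace first
    }
    document.save()
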
@@ -17,7 +17,10 @@ from common.handle.impl.media.media_split_handle import MediaSplitHandle
 
 @shared_task(name='media_learning_by_document')
 def media_learning_by_document(document_id: str, knowledge_id: str, workspace_id: str,
-                               stt_model_id: str, llm_model_id: Optional[str] = None):
+                               stt_model_id: str, llm_model_id: Optional[str] = None,
+                               limit: Optional[int] = None,
+                               patterns: Optional[List[str]] = None,
+                               with_filter: Optional[bool] = None):
     """
     Asynchronous audio/video document processing task - full status lifecycle
@@ -34,9 +37,13 @@ def media_learning_by_document(document_id: str, knowledge_id: str, workspace_id
         workspace_id: workspace ID
         stt_model_id: STT model ID
         llm_model_id: LLM model ID (optional)
+        limit: segment length (optional)
+        patterns: list of split regexes (optional)
+        with_filter: whether to strip special characters (optional)
     """
     maxkb_logger.info(f"🎬 Starting media learning task for document: {document_id}")
     maxkb_logger.info(f"📋 Current status: PENDING (排队中)")
+    maxkb_logger.info(f"📝 Split parameters - limit: {limit}, patterns: {patterns}, with_filter: {with_filter}")
 
     try:
         # Verify the document exists
@@ -81,12 +88,12 @@ def media_learning_by_document(document_id: str, knowledge_id: str, workspace_id
         def get_buffer(file_obj):
             return file_obj.read()
 
-        # Process the audio/video file (default text mode disabled)
+        # Process the audio/video file (forwarding the chunking parameters)
         result = handler.handle(
             file=temp_file,
-            pattern_list=[],
-            with_filter=False,
-            limit=0,  # do not limit the number of segments
+            pattern_list=patterns or [],  # use the supplied split patterns
+            with_filter=with_filter if with_filter is not None else False,
+            limit=limit if limit is not None else 0,  # 0 means "use the default" (handle converts it to 1000)
             get_buffer=get_buffer,
             save_image=False,
             stt_model_id=stt_model_id,
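
The fallback chain for `limit` deserves a note: `None` means "not configured", which becomes `0` at this call site, and the actual-processing branch of `handle` finally promotes `0` to the 1000-character default. A tiny sketch of the combined resolution (the helper name is hypothetical):

    def resolve_chunk_limit(limit):
        """Two-step fallback: None -> 0 at the call site, then 0 -> 1000
        inside handle's actual-processing branch (hypothetical helper)."""
        effective = limit if limit is not None else 0
        return effective if effective > 0 else 1000

    assert resolve_chunk_limit(None) == 1000  # nothing configured
    assert resolve_chunk_limit(0) == 1000     # explicit "use default"
    assert resolve_chunk_limit(500) == 500    # user-configured limit
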
@@ -180,7 +187,10 @@ def media_learning_by_document(document_id: str, knowledge_id: str, workspace_id
 
 @shared_task(name='media_learning_batch')
 def media_learning_batch(document_id_list: List[str], knowledge_id: str, workspace_id: str,
-                         stt_model_id: str, llm_model_id: Optional[str] = None):
+                         stt_model_id: str, llm_model_id: Optional[str] = None,
+                         limit: Optional[int] = None,
+                         patterns: Optional[List[str]] = None,
+                         with_filter: Optional[bool] = None):
     """
     Batch audio/video processing task
@@ -190,6 +200,9 @@ def media_learning_batch(document_id_list: List[str], knowledge_id: str, workspa
         workspace_id: workspace ID
         stt_model_id: STT model ID
         llm_model_id: LLM model ID (optional)
+        limit: segment length (optional)
+        patterns: list of split regexes (optional)
+        with_filter: whether to strip special characters (optional)
     """
     maxkb_logger.info(f"🎬 Starting batch media learning for {len(document_id_list)} documents")
 
@@ -197,7 +210,8 @@ def media_learning_batch(document_id_list: List[str], knowledge_id: str, workspa
     for document_id in document_id_list:
         try:
             media_learning_by_document.delay(
-                document_id, knowledge_id, workspace_id, stt_model_id, llm_model_id
+                document_id, knowledge_id, workspace_id, stt_model_id, llm_model_id,
+                limit, patterns, with_filter
            )
             maxkb_logger.info(f"📋 Submitted media learning task for document: {document_id}")
         except Exception as e:
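
With both task signatures extended, a caller can fan out a batch with shared chunking settings. Note that the batch task forwards `limit`, `patterns`, and `with_filter` positionally, matching their order in `media_learning_by_document`. A hedged usage sketch with placeholder IDs:

    # .delay() enqueues the Celery task; all IDs below are hypothetical
    media_learning_batch.delay(
        ['doc-1', 'doc-2'],   # document_id_list
        'knowledge-id',
        'workspace-id',
        'stt-model-id',
        'llm-model-id',
        limit=800,
        patterns=[r'。'],
        with_filter=True,
    )
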