add logs
This commit is contained in:
parent
653ee4af13
commit
c80eebd6c0
@ -2,6 +2,7 @@
|
||||
"""
|
||||
音视频处理器 - MaxKB集成层
|
||||
"""
|
||||
import traceback
|
||||
from typing import List
|
||||
from common.handle.base_split_handle import BaseSplitHandle
|
||||
from common.utils.logger import maxkb_logger
|
||||
@ -335,18 +336,30 @@ class MediaSplitHandle(BaseSplitHandle):
|
||||
|
||||
def smart_split_transcription(self, text, limit=1000, overlap=100):
|
||||
"""智能分割转录文本"""
|
||||
maxkb_logger.info(f"🔧 smart_split_transcription called - text length: {len(text)}, limit: {limit}, overlap: {overlap}")
|
||||
|
||||
if len(text) <= limit:
|
||||
maxkb_logger.info(f"✅ Text length {len(text)} <= limit {limit}, no splitting needed")
|
||||
return [text]
|
||||
|
||||
# 预处理:移除过多的空行
|
||||
original_length = len(text)
|
||||
text = '\n'.join(line for line in text.split('\n') if line.strip())
|
||||
cleaned_length = len(text)
|
||||
maxkb_logger.info(f"🧹 Text preprocessing: {original_length} -> {cleaned_length} characters")
|
||||
|
||||
chunks = []
|
||||
start = 0
|
||||
iteration = 0
|
||||
max_iterations = len(text) // (limit // 2) + 10 # 防止无限循环
|
||||
|
||||
while start < len(text) and iteration < max_iterations:
|
||||
iteration += 1
|
||||
maxkb_logger.info(f"🔄 Iteration {iteration}: start={start}, text_length={len(text)}")
|
||||
|
||||
while start < len(text):
|
||||
# 计算当前块的结束位置
|
||||
end = min(start + limit, len(text))
|
||||
maxkb_logger.info(f"📍 Initial end position: {end}")
|
||||
|
||||
# 如果不是最后一块,尝试在句号处断开
|
||||
if end < len(text):
|
||||
@ -355,39 +368,68 @@ class MediaSplitHandle(BaseSplitHandle):
|
||||
|
||||
# 优先在句号处断开
|
||||
last_period = text.rfind('。', start, end)
|
||||
maxkb_logger.info(f"🔍 Last period position: {last_period}")
|
||||
|
||||
if last_period > start + limit // 2:
|
||||
best_end = last_period + 1
|
||||
maxkb_logger.info(f"✂️ Splitting at period: position {best_end}")
|
||||
else:
|
||||
# 其次在换行符处断开
|
||||
last_newline = text.rfind('\n', start, end)
|
||||
maxkb_logger.info(f"🔍 Last newline position: {last_newline}")
|
||||
|
||||
if last_newline > start + limit // 3:
|
||||
best_end = last_newline + 1
|
||||
maxkb_logger.info(f"✂️ Splitting at newline: position {best_end}")
|
||||
else:
|
||||
maxkb_logger.info(f"⚠️ No good split point found, using default position {end}")
|
||||
|
||||
end = best_end
|
||||
|
||||
# 提取当前块
|
||||
chunk = text[start:end].strip()
|
||||
chunk_length = len(chunk)
|
||||
maxkb_logger.info(f"📦 Chunk {len(chunks)+1}: length={chunk_length}, start={start}, end={end}")
|
||||
maxkb_logger.info(f"📝 Chunk preview: '{chunk[:100]}...'")
|
||||
|
||||
if chunk:
|
||||
chunks.append(chunk)
|
||||
else:
|
||||
maxkb_logger.warning(f"⚠️ Empty chunk created, skipping")
|
||||
|
||||
# 计算下一块的开始位置(考虑重叠)
|
||||
start = max(end - overlap, len(chunks[-1]) if chunks else 0)
|
||||
# 修复:确保start总是增加的
|
||||
old_start = start
|
||||
start = max(end - overlap, end - chunk_length // 2)
|
||||
|
||||
# 确保至少前进1个字符
|
||||
if start <= old_start:
|
||||
start = old_start + 1
|
||||
maxkb_logger.warning(f"⚠️ Adjusting start position to prevent infinite loop: {start}")
|
||||
|
||||
maxkb_logger.info(f"➡️ Next start position: {start}")
|
||||
|
||||
# 避免无限循环
|
||||
if start >= len(text):
|
||||
maxkb_logger.info(f"🏁 Reached end of text")
|
||||
break
|
||||
|
||||
if iteration >= max_iterations:
|
||||
maxkb_logger.error(f"❌ Max iterations ({max_iterations}) reached, potential infinite loop")
|
||||
|
||||
maxkb_logger.info(f"✅ Split completed: {len(chunks)} chunks created in {iteration} iterations")
|
||||
return chunks
|
||||
|
||||
def _apply_smart_split(self, result: dict, limit: int = 1000, with_filter: bool = False):
|
||||
"""应用智能分块到转录结果"""
|
||||
overlap = 100 # 前后重叠字符数
|
||||
try:
|
||||
overlap = 100 # 前后重叠字符数
|
||||
|
||||
maxkb_logger.info(f"🔧 Starting smart split process - limit: {limit}, with_filter: {with_filter}")
|
||||
maxkb_logger.info(f"🔧 Starting smart split process - limit: {limit}, with_filter: {with_filter}")
|
||||
original_paragraphs = result.get('content', [])
|
||||
maxkb_logger.info(f"📊 Original paragraphs count: {len(original_paragraphs)}")
|
||||
maxkb_logger.info(f"📊 Original paragraphs count: {len(original_paragraphs)}")
|
||||
|
||||
new_paragraphs = []
|
||||
new_paragraphs = []
|
||||
total_chunks_created = 0
|
||||
|
||||
for idx, paragraph in enumerate(original_paragraphs):
|
||||
@ -408,15 +450,17 @@ class MediaSplitHandle(BaseSplitHandle):
|
||||
maxkb_logger.info(f"✂️ Paragraph {idx+1} needs splitting (length {content_length} > limit {limit})")
|
||||
|
||||
# 应用智能分块
|
||||
chunks = self.smart_split_transcription(content, limit, overlap)
|
||||
maxkb_logger.info(f"✂️ Split paragraph {idx+1} into {len(chunks)} chunks")
|
||||
try:
|
||||
maxkb_logger.info(f"🔄 Calling smart_split_transcription for paragraph {idx+1}")
|
||||
chunks = self.smart_split_transcription(content, limit, overlap)
|
||||
maxkb_logger.info(f"✂️ Split paragraph {idx+1} into {len(chunks)} chunks")
|
||||
|
||||
# 记录每个chunk的详细信息
|
||||
for c_idx, chunk in enumerate(chunks):
|
||||
maxkb_logger.info(f"📦 Chunk {c_idx+1}/{len(chunks)}: length={len(chunk)}, preview='{chunk[:50]}...'")
|
||||
# 记录每个chunk的详细信息
|
||||
for c_idx, chunk in enumerate(chunks):
|
||||
maxkb_logger.info(f"📦 Chunk {c_idx+1}/{len(chunks)}: length={len(chunk)}, preview='{chunk[:50]}...'")
|
||||
|
||||
# 创建新的段落
|
||||
for c_idx, chunk in enumerate(chunks):
|
||||
# 创建新的段落
|
||||
for c_idx, chunk in enumerate(chunks):
|
||||
# 保留原始元数据,但更新分段相关信息
|
||||
metadata = paragraph.get('metadata', {}).copy()
|
||||
metadata.update({
|
||||
@ -437,6 +481,14 @@ class MediaSplitHandle(BaseSplitHandle):
|
||||
}
|
||||
new_paragraphs.append(new_paragraph)
|
||||
total_chunks_created += 1
|
||||
|
||||
except Exception as split_error:
|
||||
maxkb_logger.error(f"❌ Error splitting paragraph {idx+1}: {str(split_error)}")
|
||||
maxkb_logger.error(f"🔍 Split error traceback: {traceback.format_exc()}")
|
||||
# 如果分块失败,保留原始段落
|
||||
maxkb_logger.info(f"🔄 Keeping original paragraph due to split error")
|
||||
new_paragraphs.append(paragraph)
|
||||
total_chunks_created += 1
|
||||
else:
|
||||
maxkb_logger.info(f"📄 Paragraph {idx+1} does not need splitting (length {content_length} <= limit {limit})")
|
||||
new_paragraphs.append(paragraph)
|
||||
@ -462,9 +514,16 @@ class MediaSplitHandle(BaseSplitHandle):
|
||||
result['metadata'] = metadata
|
||||
|
||||
maxkb_logger.info(f"✅ Smart split completed - original: {len(original_paragraphs)} paragraphs, final: {len(new_paragraphs)} chunks")
|
||||
maxkb_logger.info(f"📈 Total chunks created: {total_chunks_created}")
|
||||
maxkb_logger.info(f"📈 Total chunks created: {total_chunks_created}")
|
||||
|
||||
return result
|
||||
return result
|
||||
|
||||
except Exception as e:
|
||||
maxkb_logger.error(f"❌ Fatal error in _apply_smart_split: {str(e)}")
|
||||
maxkb_logger.error(f"🔍 Full error traceback: {traceback.format_exc()}")
|
||||
# 返回原始结果,不做处理
|
||||
maxkb_logger.info(f"🔄 Returning original result without splitting")
|
||||
return result
|
||||
|
||||
def _clean_text(self, text):
|
||||
"""清理文本:去掉重复多余符号空格、空行、制表符"""
|
||||
|
||||
Loading…
Reference in New Issue
Block a user