add logs
This commit is contained in:
parent
c80eebd6c0
commit
50166f012f
@ -335,89 +335,39 @@ class MediaSplitHandle(BaseSplitHandle):
|
||||
return f"{minutes:02d}:{secs:02d}"
|
||||
|
||||
def smart_split_transcription(self, text, limit=1000, overlap=100):
    """Split transcription text into fixed-size character windows that overlap.

    The text is cut purely by character position: window ``i`` starts at
    ``i * (limit - overlap)`` and is at most ``limit`` characters long, so
    consecutive windows share ``overlap`` characters. Each window is
    whitespace-stripped and windows that become empty are dropped.

    Args:
        text: The transcription text to split.
        limit: Maximum number of characters per chunk.
        overlap: Number of characters shared by consecutive chunks. Values
            outside ``[0, limit - 1]`` are clamped into that range so the
            window always advances by at least one character.

    Returns:
        A list of chunk strings. When ``len(text) <= limit`` the list holds
        ``text`` itself, unmodified.

    Raises:
        ValueError: If the text needs splitting but ``limit`` is < 1.
    """
    maxkb_logger.info(f"🔧 smart_split_transcription called - text length: {len(text)}, limit: {limit}, overlap: {overlap}")

    if len(text) <= limit:
        maxkb_logger.info(f"✅ Text length {len(text)} <= limit {limit}, no splitting needed")
        return [text]

    if limit < 1:
        raise ValueError(f"limit must be a positive integer, got {limit}")

    # An overlap >= limit would make the step zero or negative (division by
    # zero, or no chunks at all); a negative overlap would leave gaps between
    # chunks. Clamp so that 1 <= step <= limit.
    clamped_overlap = max(0, min(overlap, limit - 1))
    if clamped_overlap != overlap:
        maxkb_logger.warning(
            f"⚠️ overlap {overlap} is out of range for limit {limit}, using {clamped_overlap}"
        )
        overlap = clamped_overlap

    text_len = len(text)
    step = limit - overlap  # how far each window advances

    # Ceiling division: the smallest number of windows whose union reaches
    # text_len. Using floor + 1 here would add a trailing window made only of
    # already-covered overlap whenever (text_len - overlap) divides evenly.
    chunk_count = -(-(text_len - overlap) // step)

    chunks = []
    for i in range(chunk_count):
        start = i * step
        end = min(start + limit, text_len)
        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)
            maxkb_logger.info(f"📦 Chunk {i+1}: length={len(chunk)}, positions=({start}, {end})")

    # No tail pass is needed: the last window always ends at text_len, since
    # chunk_count * step + overlap >= text_len by construction.
    maxkb_logger.info(f"✅ Split completed: {len(chunks)} chunks created")
    return chunks
|
||||
|
||||
def _apply_smart_split(self, result: dict, limit: int = 1000, with_filter: bool = False):
|
||||
|
||||
Loading…
Reference in New Issue
Block a user