add logs
Some checks are pending
sync2gitee / repo-sync (push) Waiting to run
Typos Check / Spell Check with Typos (push) Waiting to run

This commit is contained in:
朱潮 2025-12-19 11:34:42 +08:00
parent c80eebd6c0
commit 50166f012f

View File

@ -335,89 +335,39 @@ class MediaSplitHandle(BaseSplitHandle):
return f"{minutes:02d}:{secs:02d}"
def smart_split_transcription(self, text, limit=1000, overlap=100):
    """Split transcription text into fixed-size character chunks with overlap.

    The text is cut purely by character position: each window is at most
    ``limit`` characters long and starts ``limit - overlap`` characters after
    the previous one, so consecutive chunks share ``overlap`` characters.
    Every chunk is stripped of surrounding whitespace, and windows that are
    empty after stripping are dropped.

    Args:
        text: The transcription text to split.
        limit: Maximum number of characters per chunk.
        overlap: Number of characters shared between consecutive chunks.
            Values outside ``[0, limit - 1]`` are clamped into that range so
            the window always advances by at least one character.

    Returns:
        A list of chunk strings. When ``len(text) <= limit`` (or ``limit`` is
        not positive) the list contains the original text unchanged.
    """
    maxkb_logger.info(f"🔧 smart_split_transcription called - text length: {len(text)}, limit: {limit}, overlap: {overlap}")

    if len(text) <= limit:
        maxkb_logger.info(f"✅ Text length {len(text)} <= limit {limit}, no splitting needed")
        return [text]

    if limit <= 0:
        # A non-positive window size cannot produce chunks; keep the text
        # intact instead of silently dropping it.
        maxkb_logger.warning(f"⚠️ Invalid limit {limit}, returning text unsplit")
        return [text]

    # Guard the step: overlap >= limit would make the step zero or negative
    # (division by zero / no chunks produced in the arithmetic version).
    overlap = max(0, min(overlap, limit - 1))
    text_len = len(text)
    step = limit - overlap  # how far each window advances

    chunks = []
    for i, start in enumerate(range(0, text_len, step)):
        end = min(start + limit, text_len)

        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)
            maxkb_logger.info(f"📦 Chunk {i+1}: length={len(chunk)}, positions=({start}, {end})")

        # Once a window reaches the end of the text, everything is covered.
        # A further window would only repeat the overlap tail of this one.
        if end >= text_len:
            break

    maxkb_logger.info(f"✅ Split completed: {len(chunks)} chunks created")
    return chunks
def _apply_smart_split(self, result: dict, limit: int = 1000, with_filter: bool = False):