add logs
Some checks are pending
sync2gitee / repo-sync (push) Waiting to run
Typos Check / Spell Check with Typos (push) Waiting to run

This commit is contained in:
朱潮 2025-12-19 11:34:42 +08:00
parent c80eebd6c0
commit 50166f012f

View File

@ -335,89 +335,39 @@ class MediaSplitHandle(BaseSplitHandle):
return f"{minutes:02d}:{secs:02d}" return f"{minutes:02d}:{secs:02d}"
def smart_split_transcription(self, text, limit=1000, overlap=100):
    """Split transcription text into overlapping, fixed-size character windows.

    The text is cut purely by character position: each window holds at most
    ``limit`` characters and starts ``limit - overlap`` characters after the
    previous one, so consecutive windows share ``overlap`` characters.
    Splitting stops as soon as a window reaches the end of the text, which
    guarantees full coverage without emitting a trailing chunk that only
    repeats the previous chunk's overlap.

    Args:
        text: The transcription text to split.
        limit: Maximum number of characters per chunk (before stripping).
        overlap: Number of characters shared between consecutive chunks.
            Values outside ``[0, limit - 1]`` are clamped into that range so
            the window always advances by at least one character.

    Returns:
        A list of chunk strings. Each chunk is stripped of surrounding
        whitespace, and chunks that are empty after stripping are omitted.
        When ``len(text) <= limit`` the text is returned unchanged as the
        single element of the list.
    """
    maxkb_logger.info(f"🔧 smart_split_transcription called - text length: {len(text)}, limit: {limit}, overlap: {overlap}")

    if len(text) <= limit:
        maxkb_logger.info(f"✅ Text length {len(text)} <= limit {limit}, no splitting needed")
        return [text]

    text_len = len(text)

    # Sanitize the window parameters. A non-positive step (overlap >= limit)
    # would otherwise divide by zero or walk backwards, and a negative overlap
    # would leave gaps between chunks and silently drop text.
    safe_limit = max(1, limit)
    safe_overlap = min(max(overlap, 0), safe_limit - 1)
    if safe_limit != limit or safe_overlap != overlap:
        maxkb_logger.warning(
            f"⚠️ Invalid split parameters (limit={limit}, overlap={overlap}), "
            f"using limit={safe_limit}, overlap={safe_overlap}"
        )
    step = safe_limit - safe_overlap  # distance between consecutive window starts

    chunks = []
    for i, start in enumerate(range(0, text_len, step)):
        end = min(start + safe_limit, text_len)
        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)
            maxkb_logger.info(f"📦 Chunk {i+1}: length={len(chunk)}, positions=({start}, {end})")
        # Once a window touches the end of the text everything is covered;
        # any further window would only duplicate the overlap region.
        if end >= text_len:
            break

    maxkb_logger.info(f"✅ Split completed: {len(chunks)} chunks created")
    return chunks
def _apply_smart_split(self, result: dict, limit: int = 1000, with_filter: bool = False): def _apply_smart_split(self, result: dict, limit: int = 1000, with_filter: bool = False):