add logs
This commit is contained in:
parent
c80eebd6c0
commit
50166f012f
@ -335,89 +335,39 @@ class MediaSplitHandle(BaseSplitHandle):
|
|||||||
return f"{minutes:02d}:{secs:02d}"
|
return f"{minutes:02d}:{secs:02d}"
|
||||||
|
|
||||||
def smart_split_transcription(self, text, limit=1000, overlap=100):
|
def smart_split_transcription(self, text, limit=1000, overlap=100):
|
||||||
"""智能分割转录文本"""
|
"""简单分割转录文本 - 按字符切割,保留重叠"""
|
||||||
maxkb_logger.info(f"🔧 smart_split_transcription called - text length: {len(text)}, limit: {limit}, overlap: {overlap}")
|
maxkb_logger.info(f"🔧 smart_split_transcription called - text length: {len(text)}, limit: {limit}, overlap: {overlap}")
|
||||||
|
|
||||||
if len(text) <= limit:
|
if len(text) <= limit:
|
||||||
maxkb_logger.info(f"✅ Text length {len(text)} <= limit {limit}, no splitting needed")
|
maxkb_logger.info(f"✅ Text length {len(text)} <= limit {limit}, no splitting needed")
|
||||||
return [text]
|
return [text]
|
||||||
|
|
||||||
# 预处理:移除过多的空行
|
text_len = len(text)
|
||||||
original_length = len(text)
|
step = limit - overlap # 每次前进的步长
|
||||||
text = '\n'.join(line for line in text.split('\n') if line.strip())
|
|
||||||
cleaned_length = len(text)
|
|
||||||
maxkb_logger.info(f"🧹 Text preprocessing: {original_length} -> {cleaned_length} characters")
|
|
||||||
|
|
||||||
|
# 直接计算所有chunk
|
||||||
|
chunk_count = (text_len - overlap) // step + 1
|
||||||
chunks = []
|
chunks = []
|
||||||
start = 0
|
|
||||||
iteration = 0
|
|
||||||
max_iterations = len(text) // (limit // 2) + 10 # 防止无限循环
|
|
||||||
|
|
||||||
while start < len(text) and iteration < max_iterations:
|
for i in range(chunk_count):
|
||||||
iteration += 1
|
start = i * step
|
||||||
maxkb_logger.info(f"🔄 Iteration {iteration}: start={start}, text_length={len(text)}")
|
end = min(start + limit, text_len)
|
||||||
|
|
||||||
# 计算当前块的结束位置
|
|
||||||
end = min(start + limit, len(text))
|
|
||||||
maxkb_logger.info(f"📍 Initial end position: {end}")
|
|
||||||
|
|
||||||
# 如果不是最后一块,尝试在句号处断开
|
|
||||||
if end < len(text):
|
|
||||||
# 寻找最佳断点
|
|
||||||
best_end = end
|
|
||||||
|
|
||||||
# 优先在句号处断开
|
|
||||||
last_period = text.rfind('。', start, end)
|
|
||||||
maxkb_logger.info(f"🔍 Last period position: {last_period}")
|
|
||||||
|
|
||||||
if last_period > start + limit // 2:
|
|
||||||
best_end = last_period + 1
|
|
||||||
maxkb_logger.info(f"✂️ Splitting at period: position {best_end}")
|
|
||||||
else:
|
|
||||||
# 其次在换行符处断开
|
|
||||||
last_newline = text.rfind('\n', start, end)
|
|
||||||
maxkb_logger.info(f"🔍 Last newline position: {last_newline}")
|
|
||||||
|
|
||||||
if last_newline > start + limit // 3:
|
|
||||||
best_end = last_newline + 1
|
|
||||||
maxkb_logger.info(f"✂️ Splitting at newline: position {best_end}")
|
|
||||||
else:
|
|
||||||
maxkb_logger.info(f"⚠️ No good split point found, using default position {end}")
|
|
||||||
|
|
||||||
end = best_end
|
|
||||||
|
|
||||||
# 提取当前块
|
|
||||||
chunk = text[start:end].strip()
|
chunk = text[start:end].strip()
|
||||||
chunk_length = len(chunk)
|
|
||||||
maxkb_logger.info(f"📦 Chunk {len(chunks)+1}: length={chunk_length}, start={start}, end={end}")
|
|
||||||
maxkb_logger.info(f"📝 Chunk preview: '{chunk[:100]}...'")
|
|
||||||
|
|
||||||
if chunk:
|
if chunk:
|
||||||
chunks.append(chunk)
|
chunks.append(chunk)
|
||||||
else:
|
maxkb_logger.info(f"📦 Chunk {i+1}: length={len(chunk)}, positions=({start}, {end})")
|
||||||
maxkb_logger.warning(f"⚠️ Empty chunk created, skipping")
|
|
||||||
|
|
||||||
# 计算下一块的开始位置(考虑重叠)
|
# 处理最后可能遗漏的部分
|
||||||
# 修复:确保start总是增加的
|
if chunks:
|
||||||
old_start = start
|
last_end = chunks[-1] and (len(chunks) - 1) * step + limit or 0
|
||||||
start = max(end - overlap, end - chunk_length // 2)
|
if last_end < text_len:
|
||||||
|
chunk = text[last_end:].strip()
|
||||||
|
if chunk:
|
||||||
|
chunks.append(chunk)
|
||||||
|
maxkb_logger.info(f"📦 Final chunk: length={len(chunk)}")
|
||||||
|
|
||||||
# 确保至少前进1个字符
|
maxkb_logger.info(f"✅ Split completed: {len(chunks)} chunks created")
|
||||||
if start <= old_start:
|
|
||||||
start = old_start + 1
|
|
||||||
maxkb_logger.warning(f"⚠️ Adjusting start position to prevent infinite loop: {start}")
|
|
||||||
|
|
||||||
maxkb_logger.info(f"➡️ Next start position: {start}")
|
|
||||||
|
|
||||||
# 避免无限循环
|
|
||||||
if start >= len(text):
|
|
||||||
maxkb_logger.info(f"🏁 Reached end of text")
|
|
||||||
break
|
|
||||||
|
|
||||||
if iteration >= max_iterations:
|
|
||||||
maxkb_logger.error(f"❌ Max iterations ({max_iterations}) reached, potential infinite loop")
|
|
||||||
|
|
||||||
maxkb_logger.info(f"✅ Split completed: {len(chunks)} chunks created in {iteration} iterations")
|
|
||||||
return chunks
|
return chunks
|
||||||
|
|
||||||
def _apply_smart_split(self, result: dict, limit: int = 1000, with_filter: bool = False):
|
def _apply_smart_split(self, result: dict, limit: int = 1000, with_filter: bool = False):
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user