def smart_split_transcription(self, text, limit=1000, overlap=100):
    """Split transcription text into fixed-size character windows that overlap.

    Text no longer than ``limit`` is returned unchanged as a single-element
    list. Longer text is cut into windows of at most ``limit`` characters,
    each starting ``limit - overlap`` characters after the previous one.
    Every window is stripped of surrounding whitespace, and windows that are
    empty after stripping are dropped.

    Args:
        text: The transcription text to split.
        limit: Maximum number of characters per chunk. Must be positive
            whenever the text actually needs splitting.
        overlap: Number of characters shared by consecutive windows.
            Negative values are treated as 0; values ``>= limit`` fall back
            to ``limit // 2`` so that the window always moves forward.

    Returns:
        A list of chunk strings in document order.

    Raises:
        ValueError: If the text needs splitting and ``limit`` is not positive.
    """
    text_len = len(text)
    maxkb_logger.info(f"🔧 smart_split_transcription called - text length: {text_len}, limit: {limit}, overlap: {overlap}")

    if text_len <= limit:
        maxkb_logger.info(f"✅ Text length {text_len} <= limit {limit}, no splitting needed")
        return [text]

    if limit <= 0:
        # A non-positive window can never make progress; fail loudly instead
        # of silently returning no chunks or dividing by zero.
        raise ValueError(f"limit must be a positive integer, got {limit}")

    # Normalise the overlap so the step between windows is always >= 1.
    effective_overlap = overlap
    if overlap < 0:
        effective_overlap = 0
        maxkb_logger.warning(f"⚠️ Negative overlap {overlap} is not supported, using 0")
    elif overlap >= limit:
        effective_overlap = limit // 2
        maxkb_logger.warning(f"⚠️ Overlap {overlap} >= limit {limit}, falling back to {effective_overlap}")
    step = limit - effective_overlap

    chunks = []
    for index, start in enumerate(range(0, text_len, step), start=1):
        end = min(start + limit, text_len)

        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)
            maxkb_logger.info(f"📦 Chunk {index}: length={len(chunk)}, positions=({start}, {end})")

        # Once a window reaches the end of the text everything is covered;
        # any further window would only repeat the overlap tail.
        if end >= text_len:
            break

    maxkb_logger.info(f"✅ Split completed: {len(chunks)} chunks created")
    return chunks