修复分段逻辑

2025-12-19 12:34:22 +08:00 · 2025-12-19 12:34:22 +08:00 · 545402617b
commit 545402617b
parent 653ee4af13
2 changed files with 92 additions and 29 deletions
--- a/apps/common/handle/impl/media/media_split_handle.py
+++ b/apps/common/handle/impl/media/media_split_handle.py
@@ -334,49 +334,35 @@ class MediaSplitHandle(BaseSplitHandle):
            return f"{minutes:02d}:{secs:02d}"

    def smart_split_transcription(self, text, limit=1000, overlap=100):
-        """智能分割转录文本"""
+        """Split transcription text into overlapping fixed-size chunks of at most `limit` characters (simplified version)."""
+        # Text that already fits within the limit is returned as a single chunk
        if len(text) <= limit:
            return [text]

-        # 预处理：移除过多的空行
-        text = '\n'.join(line for line in text.split('\n') if line.strip())
+        # An overlap of `limit` or more would leave a stride <= 0, so fall back to half the limit
+        if overlap >= limit:
+            overlap = limit // 2

        chunks = []
        start = 0
+        step_size = limit - overlap  # distance the window start advances for each chunk

        while start < len(text):
            # 计算当前块的结束位置
            end = min(start + limit, len(text))

-            # 如果不是最后一块，尝试在句号处断开
-            if end < len(text):
-                # 寻找最佳断点
-                best_end = end
-
-                # 优先在句号处断开
-                last_period = text.rfind('。', start, end)
-                if last_period > start + limit // 2:
-                    best_end = last_period + 1
-                else:
-                    # 其次在换行符处断开
-                    last_newline = text.rfind('\n', start, end)
-                    if last_newline > start + limit // 3:
-                        best_end = last_newline + 1
-
-                end = best_end
-
-            # 提取当前块
-            chunk = text[start:end].strip()
-            if chunk:
+            # Take the slice exactly as it is (no stripping or reformatting)
+            chunk = text[start:end]
            chunks.append(chunk)

-            # 计算下一块的开始位置（考虑重叠）
-            start = max(end - overlap, len(chunks[-1]) if chunks else 0)
-
-            # 避免无限循环
-            if start >= len(text):
+            # Work out where the next chunk starts;
+            # if this chunk already reached the end of the text, stop here
+            if end >= len(text):
                break

+            # Advance by step_size. NOTE(review): progress needs limit >= 1 (limit <= 0 never terminates); a negative overlap skips text between chunks - confirm callers
+            start += step_size
+

        return chunks

    def _apply_smart_split(self, result: dict, limit: int = 1000, with_filter: bool = False):
--- a/test_smart_split.py
+++ b/test_smart_split.py
@@ -0,0 +1,77 @@
+#!/usr/bin/env python
+"""测试优化后的 smart_split_transcription 函数"""
+
+import sys
+import os
+sys.path.append(os.path.dirname(os.path.abspath(__file__)))
+
+from apps.common.handle.impl.media.media_split_handle import MediaSplitHandle
+
+def test_smart_split():
+    """Print chunk statistics of smart_split_transcription for typical and edge-case inputs (manual smoke check)."""
+    handler = MediaSplitHandle()
+
+    print("测试1: 短文本（≤ limit）")
+    short_text = "这是一个短文本"
+    result = handler.smart_split_transcription(short_text, limit=1000, overlap=100)
+    print(f"输入长度: {len(short_text)}, 输出块数: {len(result)}")
+    print(f"结果: {result}")
+    print()
+
+    print("测试2: 长文本（> limit）")
+    long_text = "这是一段很长的文本。" * 200  # 10 characters x 200 = 2000 characters
+    result = handler.smart_split_transcription(long_text, limit=1000, overlap=100)
+    print(f"输入长度: {len(long_text)}, 输出块数: {len(result)}")
+    print(f"第一块长度: {len(result[0])}")
+    print(f"最后一块长度: {len(result[-1])}")
+    print()
+
+    print("测试3: 验证重叠")
+    # Zero-padded counters never repeat, so the suffix/prefix match below has a single
+    # answer (periodic text would also match at other lengths).
+    long_text = "".join(f"{i:04d}" for i in range(325))  # 1300 characters
+    result = handler.smart_split_transcription(long_text, limit=500, overlap=100)
+    print(f"输入长度: {len(long_text)}, 输出块数: {len(result)}")
+
+    # Check the overlap between the first two chunks
+    if len(result) > 1:
+        print(f"\n第一块末尾50字符: {result[0][-50:]}")
+        print(f"第二块开头50字符: {result[1][:50]}")
+
+        # The overlap is the longest suffix of chunk 1 that is also a prefix of chunk 2
+        overlap_len = 0
+        max_possible = min(len(result[0]), len(result[1]))
+        for size in range(1, max_possible + 1):
+            if result[0][-size:] == result[1][:size]:
+                overlap_len = size
+        print(f"检测到的重叠长度: {overlap_len}")
+    print()
+
+    print("测试4: 边界情况 - overlap >= limit")
+    text = "测试文本" * 100
+    result = handler.smart_split_transcription(text, limit=200, overlap=200)
+    print(f"输入长度: {len(text)}, limit=200, overlap=200")
+    print(f"输出块数: {len(result)}")
+    for i, chunk in enumerate(result):
+        print(f"块{i+1}长度: {len(chunk)}")
+    print()
+
+    print("测试5: 包含换行符的文本")
+    text_with_newlines = "第一行\n第二行\n第三行\n" * 100
+    result = handler.smart_split_transcription(text_with_newlines, limit=500, overlap=50)
+    print(f"输入长度: {len(text_with_newlines)}, 输出块数: {len(result)}")
+    has_newline = "\n" in result[0]  # kept outside the f-string: a backslash inside {...} is a SyntaxError before Python 3.12
+    print(f"第一块是否保留换行符: {has_newline}")
+    print()
+
+    print("测试6: 极端情况 - 空文本")
+    result = handler.smart_split_transcription("", limit=1000, overlap=100)
+    print(f"空文本输入，输出: {result}")
+    print()
+
+    print("测试7: 极端情况 - 单个字符")
+    result = handler.smart_split_transcription("A", limit=1000, overlap=100)
+    print(f"单字符输入，输出: {result}")
+
+if __name__ == "__main__":  # run the checks only when executed as a script, not on import
+    test_smart_split()