From 545402617b62b48b50cbde83a957343966298ba1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=9C=B1=E6=BD=AE?= <zhuchaowe@users.noreply.github.com>
Date: Fri, 19 Dec 2025 12:34:22 +0800
Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E5=88=86=E6=AE=B5=E9=80=BB?=
 =?UTF-8?q?=E8=BE=91?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../handle/impl/media/media_split_handle.py   | 44 ++++-------
 test_smart_split.py                           | 77 +++++++++++++++++++
 2 files changed, 92 insertions(+), 29 deletions(-)
 create mode 100644 test_smart_split.py

diff --git a/apps/common/handle/impl/media/media_split_handle.py b/apps/common/handle/impl/media/media_split_handle.py
index d1432205..ada65130 100644
--- a/apps/common/handle/impl/media/media_split_handle.py
+++ b/apps/common/handle/impl/media/media_split_handle.py
@@ -334,49 +334,35 @@ class MediaSplitHandle(BaseSplitHandle):
             return f"{minutes:02d}:{secs:02d}"
 
     def smart_split_transcription(self, text, limit=1000, overlap=100):
-        """智能分割转录文本"""
+        """Split transcription text into overlapping chunks of at most `limit` chars."""
+        # Text that already fits within `limit` is returned as a single chunk.
         if len(text) <= limit:
             return [text]
 
-        # 预处理：移除过多的空行
-        text = '\n'.join(line for line in text.split('\n') if line.strip())
+        # Clamp both sizes so step_size >= 1 (no endless loop, no skipped text).
+        limit = max(limit, 1)
+        overlap = limit // 2 if overlap >= limit else max(overlap, 0)
 
         chunks = []
         start = 0
+        step_size = limit - overlap  # distance the window moves per chunk
 
         while start < len(text):
             # 计算当前块的结束位置
             end = min(start + limit, len(text))
 
-            # 如果不是最后一块，尝试在句号处断开
-            if end < len(text):
-                # 寻找最佳断点
-                best_end = end
+            # Slice the chunk verbatim (no stripping or reformatting).
+            chunk = text[start:end]
+            chunks.append(chunk)
 
-                # 优先在句号处断开
-                last_period = text.rfind('。', start, end)
-                if last_period > start + limit // 2:
-                    best_end = last_period + 1
-                else:
-                    # 其次在换行符处断开
-                    last_newline = text.rfind('\n', start, end)
-                    if last_newline > start + limit // 3:
-                        best_end = last_newline + 1
-
-                end = best_end
-
-            # 提取当前块
-            chunk = text[start:end].strip()
-            if chunk:
-                chunks.append(chunk)
-
-            # 计算下一块的开始位置（考虑重叠）
-            start = max(end - overlap, len(chunks[-1]) if chunks else 0)
-
-            # 避免无限循环
-            if start >= len(text):
+            # A chunk that reaches the end of the text is the last one; stopping
+            # here avoids emitting a tail made only of already-covered overlap.
+            if end >= len(text):
                 break
 
+            # step_size >= 1 after clamping, so every iteration makes progress.
+            start += step_size
+
         return chunks
 
     def _apply_smart_split(self, result: dict, limit: int = 1000, with_filter: bool = False):
diff --git a/test_smart_split.py b/test_smart_split.py
new file mode 100644
index 00000000..41e859e1
--- /dev/null
+++ b/test_smart_split.py
@@ -0,0 +1,77 @@
+#!/usr/bin/env python
+"""测试优化后的 smart_split_transcription 函数"""
+
+import sys
+import os
+sys.path.append(os.path.dirname(os.path.abspath(__file__)))
+
+from apps.common.handle.impl.media.media_split_handle import MediaSplitHandle
+
+def test_smart_split():
+    """Print the chunking of several inputs for manual inspection (no assertions)."""
+    handler = MediaSplitHandle()
+
+    print("测试1: 短文本（≤ limit）")
+    short_text = "这是一个短文本"
+    result = handler.smart_split_transcription(short_text, limit=1000, overlap=100)
+    print(f"输入长度: {len(short_text)}, 输出块数: {len(result)}")
+    print(f"结果: {result}")
+    print()
+
+    print("测试2: 长文本（> limit）")
+    long_text = "这是一段很长的文本。" * 200  # 10 chars x 200 = 2000 characters
+    result = handler.smart_split_transcription(long_text, limit=1000, overlap=100)
+    print(f"输入长度: {len(long_text)}, 输出块数: {len(result)}")
+    print(f"第一块长度: {len(result[0])}")
+    print(f"最后一块长度: {len(result[-1])}")
+    print()
+
+    print("测试3: 验证重叠")
+    # Zero-padded counters make the text aperiodic, so the overlap measured
+    # below is unambiguous (a repeated alphabet matches at many offsets).
+    long_text = "".join(f"{i:04d}" for i in range(325))  # 1300 characters
+    result = handler.smart_split_transcription(long_text, limit=500, overlap=100)
+    print(f"输入长度: {len(long_text)}, 输出块数: {len(result)}")
+
+    # Measure the overlap between the first two chunks.
+    if len(result) > 1:
+        first, second = result[0], result[1]
+        print(f"\n第一块末尾50字符: {first[-50:]}")
+        print(f"第二块开头50字符: {second[:50]}")
+
+        # Longest suffix of the first chunk that is also a prefix of the second;
+        # whole chunks are compared because the overlap can exceed 50 characters.
+        candidates = range(1, min(len(first), len(second)) + 1)
+        overlap_len = max((k for k in candidates if first[-k:] == second[:k]), default=0)
+        print(f"检测到的重叠长度: {overlap_len}")
+    print()
+
+    print("测试4: 边界情况 - overlap >= limit")
+    # With overlap >= limit the window could not advance unless overlap is reduced.
+    text = "测试文本" * 100
+    result = handler.smart_split_transcription(text, limit=200, overlap=200)
+    print(f"输入长度: {len(text)}, limit=200, overlap=200")
+    print(f"输出块数: {len(result)}")
+    for i, chunk in enumerate(result):
+        print(f"块{i+1}长度: {len(chunk)}")
+    print()
+
+    print("测试5: 包含换行符的文本")
+    text_with_newlines = "第一行\n第二行\n第三行\n" * 100
+    result = handler.smart_split_transcription(text_with_newlines, limit=500, overlap=50)
+    print(f"输入长度: {len(text_with_newlines)}, 输出块数: {len(result)}")
+    # Real newline test, kept out of an f-string field (backslash there fails before 3.12).
+    print("第一块是否保留换行符: {}".format("\n" in result[0]))
+    print()
+
+    print("测试6: 极端情况 - 空文本")
+    result = handler.smart_split_transcription("", limit=1000, overlap=100)
+    print(f"空文本输入，输出: {result}")
+    print()
+
+    print("测试7: 极端情况 - 单个字符")
+    result = handler.smart_split_transcription("A", limit=1000, overlap=100)
+    print(f"单字符输入，输出: {result}")
+
+if __name__ == "__main__":
+    test_smart_split()
\ No newline at end of file