From 545402617b62b48b50cbde83a957343966298ba1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=9C=B1=E6=BD=AE?=
Date: Fri, 19 Dec 2025 12:34:22 +0800
Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E5=88=86=E6=AE=B5=E9=80=BB?=
 =?UTF-8?q?=E8=BE=91?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../handle/impl/media/media_split_handle.py   | 58 ++++++------
 test_smart_split.py                           | 94 +++++++++++++++++++
 2 files changed, 123 insertions(+), 29 deletions(-)
 create mode 100644 test_smart_split.py

diff --git a/apps/common/handle/impl/media/media_split_handle.py b/apps/common/handle/impl/media/media_split_handle.py
index d1432205..ada65130 100644
--- a/apps/common/handle/impl/media/media_split_handle.py
+++ b/apps/common/handle/impl/media/media_split_handle.py
@@ -334,49 +334,49 @@ class MediaSplitHandle(BaseSplitHandle):
         return f"{minutes:02d}:{secs:02d}"
 
     def smart_split_transcription(self, text, limit=1000, overlap=100):
-        """智能分割转录文本"""
+        """Split transcription text into fixed-size, overlapping chunks.
+
+        Text is sliced verbatim; nothing is stripped or reformatted.
+
+        :param text: transcription to split.
+        :param limit: maximum characters per chunk; a non-positive value
+            disables splitting.
+        :param overlap: characters shared by consecutive chunks; clamped
+            into ``[0, limit)`` so every step moves forward.
+        :return: list of chunks, ``[text]`` when no split is needed.
+        """
+        # A non-positive limit would stall the window, so skip splitting.
+        if limit <= 0:
+            return [text]
+
         if len(text) <= limit:
             return [text]
 
-        # 预处理:移除过多的空行
-        text = '\n'.join(line for line in text.split('\n') if line.strip())
+        # Clamp overlap into [0, limit) so step_size is always at least 1.
+        if overlap >= limit:
+            overlap = limit // 2
+        elif overlap < 0:
+            overlap = 0
 
         chunks = []
         start = 0
+        step_size = limit - overlap  # distance the window advances per chunk
 
         while start < len(text):
             # 计算当前块的结束位置
             end = min(start + limit, len(text))
 
-            # 如果不是最后一块,尝试在句号处断开
-            if end < len(text):
-                # 寻找最佳断点
-                best_end = end
+            # Take the slice as-is so the original formatting is preserved.
+            chunk = text[start:end]
+            chunks.append(chunk)
 
-                # 优先在句号处断开
-                last_period = text.rfind('。', start, end)
-                if last_period > start + limit // 2:
-                    best_end = last_period + 1
-                else:
-                    # 其次在换行符处断开
-                    last_newline = text.rfind('\n', start, end)
-                    if last_newline > start + limit // 3:
-                        best_end = last_newline + 1
-
-                end = best_end
-
-            # 提取当前块
-            chunk = text[start:end].strip()
-            if chunk:
-                chunks.append(chunk)
-
-            # 计算下一块的开始位置(考虑重叠)
-            start = max(end - overlap, len(chunks[-1]) if chunks else 0)
-
-            # 避免无限循环
-            if start >= len(text):
+            # Stop once the window has reached the end of the text.
+            if end >= len(text):
                 break
 
+            # Advance by step_size (>= 1), so the loop always makes progress.
+            start += step_size
+
         return chunks
 
     def _apply_smart_split(self, result: dict, limit: int = 1000, with_filter: bool = False):
diff --git a/test_smart_split.py b/test_smart_split.py
new file mode 100644
index 00000000..41e859e1
--- /dev/null
+++ b/test_smart_split.py
@@ -0,0 +1,94 @@
+#!/usr/bin/env python
+"""Exercise MediaSplitHandle.smart_split_transcription on typical and edge inputs."""
+
+import os
+import sys
+
+sys.path.append(os.path.dirname(os.path.abspath(__file__)))
+
+from apps.common.handle.impl.media.media_split_handle import MediaSplitHandle
+
+
+def test_smart_split():
+    handler = MediaSplitHandle()
+
+    print("测试1: 短文本(≤ limit)")
+    short_text = "这是一个短文本"
+    result = handler.smart_split_transcription(short_text, limit=1000, overlap=100)
+    print(f"输入长度: {len(short_text)}, 输出块数: {len(result)}")
+    print(f"结果: {result}")
+    assert result == [short_text]
+    print()
+
+    print("测试2: 长文本(> limit)")
+    long_text = "这是一段很长的文本。" * 200  # 10 chars x 200 = 2000 chars
+    result = handler.smart_split_transcription(long_text, limit=1000, overlap=100)
+    print(f"输入长度: {len(long_text)}, 输出块数: {len(result)}")
+    print(f"第一块长度: {len(result[0])}")
+    print(f"最后一块长度: {len(result[-1])}")
+    assert [len(chunk) for chunk in result] == [1000, 1000, 200]
+    print()
+
+    print("测试3: 验证重叠")
+    # Distinct characters: a suffix/prefix match can only come from real
+    # overlap, never from repeated content.
+    long_text = "".join(chr(0x4E00 + i) for i in range(1300))
+    result = handler.smart_split_transcription(long_text, limit=500, overlap=100)
+    print(f"输入长度: {len(long_text)}, 输出块数: {len(result)}")
+    assert len(result) == 3
+
+    end_of_first = result[0][-50:]
+    start_of_second = result[1][:50]
+    print(f"\n第一块末尾50字符: {end_of_first}")
+    print(f"第二块开头50字符: {start_of_second}")
+
+    # Longest prefix of the second chunk that is also a suffix of the first.
+    overlap_len = max(
+        size for size in range(len(result[1]) + 1)
+        if result[0].endswith(result[1][:size])
+    )
+    print(f"检测到的重叠长度: {overlap_len}")
+    assert overlap_len == 100
+    print()
+
+    print("测试4: 边界情况 - overlap >= limit")
+    text = "测试文本" * 100
+    result = handler.smart_split_transcription(text, limit=200, overlap=200)
+    print(f"输入长度: {len(text)}, limit=200, overlap=200")
+    print(f"输出块数: {len(result)}")
+    for i, chunk in enumerate(result):
+        print(f"块{i+1}长度: {len(chunk)}")
+    # overlap is clamped to limit // 2 = 100, so the window advances by 100.
+    assert [len(chunk) for chunk in result] == [200, 200, 200]
+    print()
+
+    print("测试5: 包含换行符的文本")
+    text_with_newlines = "第一行\n第二行\n第三行\n" * 100
+    result = handler.smart_split_transcription(text_with_newlines, limit=500, overlap=50)
+    print(f"输入长度: {len(text_with_newlines)}, 输出块数: {len(result)}")
+    # Evaluated outside the f-string: a backslash inside an f-string
+    # expression is a SyntaxError before Python 3.12.
+    keeps_newlines = "\n" in result[0]
+    print(f"第一块是否保留换行符: {keeps_newlines}")
+    assert keeps_newlines
+    print()
+
+    print("测试6: 极端情况 - 空文本")
+    result = handler.smart_split_transcription("", limit=1000, overlap=100)
+    print(f"空文本输入,输出: {result}")
+    assert result == [""]
+    print()
+
+    print("测试7: 极端情况 - 单个字符")
+    result = handler.smart_split_transcription("A", limit=1000, overlap=100)
+    print(f"单字符输入,输出: {result}")
+    assert result == ["A"]
+
+    # Degenerate parameters must terminate: a non-positive limit disables
+    # splitting and a negative overlap is treated as zero.
+    assert handler.smart_split_transcription("abcdef", limit=0) == ["abcdef"]
+    assert handler.smart_split_transcription("abcdef", limit=4, overlap=-2) == ["abcd", "ef"]
+
+
+if __name__ == "__main__":
+    test_smart_split()