maxkb/test_smart_split.py
朱潮 545402617b
Some checks are pending
sync2gitee / repo-sync (push) Waiting to run
Typos Check / Spell Check with Typos (push) Waiting to run
修复分段逻辑
2025-12-19 12:34:22 +08:00

77 lines
2.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python
"""测试优化后的 smart_split_transcription 函数"""
import sys
import os
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
from apps.common.handle.impl.media.media_split_handle import MediaSplitHandle
def test_smart_split():
    """Smoke-test ``MediaSplitHandle.smart_split_transcription`` by printing results.

    Runs seven scenarios (short text, long text, overlap inspection,
    ``overlap >= limit``, text containing newlines, empty text, single
    character) and prints the chunk counts / lengths for manual inspection.

    The function takes no arguments, returns ``None`` and makes no
    assertions; it only fails if the handler itself raises or returns
    something that cannot be indexed as the prints below expect.
    """
    handler = MediaSplitHandle()

    # Scenario 1: input shorter than the limit.
    print("测试1: 短文本(≤ limit")
    short_text = "这是一个短文本"
    result = handler.smart_split_transcription(short_text, limit=1000, overlap=100)
    print(f"输入长度: {len(short_text)}, 输出块数: {len(result)}")
    print(f"结果: {result}")
    print()

    # Scenario 2: input longer than the limit.
    print("测试2: 长文本(> limit")
    long_text = "这是一段很长的文本。" * 200  # 10 characters x 200 = 2000 characters
    result = handler.smart_split_transcription(long_text, limit=1000, overlap=100)
    print(f"输入长度: {len(long_text)}, 输出块数: {len(result)}")
    print(f"第一块长度: {len(result[0])}")
    print(f"最后一块长度: {len(result[-1])}")
    print()

    # Scenario 3: inspect the overlap between the first two chunks.
    print("测试3: 验证重叠")
    # Text built from a recognisable repeating pattern.
    base = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    long_text = base * 50  # 26 characters x 50 = 1300 characters
    result = handler.smart_split_transcription(long_text, limit=500, overlap=100)
    print(f"输入长度: {len(long_text)}, 输出块数: {len(result)}")
    if len(result) > 1:
        end_of_first = result[0][-50:]
        start_of_second = result[1][:50]
        print(f"\n第一块末尾50字符: {end_of_first}")
        print(f"第二块开头50字符: {start_of_second}")
        # Longest k such that the last k characters of the first window equal
        # the first k characters of the second window.
        # NOTE(review): only 50 characters are compared although overlap=100,
        # and the input is periodic, so the reported value is capped at 50 and
        # may reflect a coincidental pattern match - confirm this is intended.
        overlap_len = 0
        max_possible = min(len(end_of_first), len(start_of_second))
        for size in range(1, max_possible + 1):
            if end_of_first[-size:] == start_of_second[:size]:
                overlap_len = size
        # Printed inside the branch because overlap_len only exists here.
        print(f"检测到的重叠长度: {overlap_len}")
    print()

    # Scenario 4: overlap equal to the limit.
    print("测试4: 边界情况 - overlap >= limit")
    text = "测试文本" * 100
    result = handler.smart_split_transcription(text, limit=200, overlap=200)
    print(f"输入长度: {len(text)}, limit=200, overlap=200")
    print(f"输出块数: {len(result)}")
    for i, chunk in enumerate(result):
        print(f"{i+1}长度: {len(chunk)}")
    print()

    # Scenario 5: text that contains real newline characters.
    print("测试5: 包含换行符的文本")
    text_with_newlines = "第一行\n第二行\n第三行\n" * 100
    result = handler.smart_split_transcription(text_with_newlines, limit=500, overlap=50)
    print(f"输入长度: {len(text_with_newlines)}, 输出块数: {len(result)}")
    # Evaluate the membership test outside the f-string: a backslash inside a
    # replacement field is a SyntaxError before Python 3.12, and the check must
    # look for an actual newline rather than a backslash followed by "n".
    has_newline = "\n" in result[0]
    print(f"第一块是否保留换行符: {has_newline}")
    print()

    # Scenario 6: empty input.
    print("测试6: 极端情况 - 空文本")
    result = handler.smart_split_transcription("", limit=1000, overlap=100)
    print(f"空文本输入,输出: {result}")
    print()

    # Scenario 7: single-character input.
    print("测试7: 极端情况 - 单个字符")
    result = handler.smart_split_transcription("A", limit=1000, overlap=100)
    print(f"单字符输入,输出: {result}")
# Script entry point: run the smoke test only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    test_smart_split()