#!/usr/bin/env python
"""Test the optimized smart_split_transcription function."""

import os
import sys

# Add this script's own directory to the import path; presumably this lets the
# ``apps`` package below resolve when the file is run directly -- confirm
# against the repository layout.
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

from apps.common.handle.impl.media.media_split_handle import MediaSplitHandle


def test_smart_split():
    """Run print-based manual checks of ``smart_split_transcription``.

    Creates a ``MediaSplitHandle`` and calls ``smart_split_transcription`` on
    seven inputs (short text, long text, overlap inspection, ``overlap`` equal
    to ``limit``, text containing newlines, empty text, single character),
    printing sizes and results for visual inspection. Nothing is asserted and
    nothing is returned.
    """
    handler = MediaSplitHandle()

    # Case 1: text no longer than the limit.
    print("测试1: 短文本(≤ limit)")
    short_text = "这是一个短文本"
    result = handler.smart_split_transcription(short_text, limit=1000, overlap=100)
    print(f"输入长度: {len(short_text)}, 输出块数: {len(result)}")
    print(f"结果: {result}")
    print()

    # Case 2: text longer than the limit.
    print("测试2: 长文本(> limit)")
    long_text = "这是一段很长的文本。" * 200  # 10 characters x 200 = 2000 characters
    result = handler.smart_split_transcription(long_text, limit=1000, overlap=100)
    print(f"输入长度: {len(long_text)}, 输出块数: {len(result)}")
    print(f"第一块长度: {len(result[0])}")
    print(f"最后一块长度: {len(result[-1])}")
    print()

    # Case 3: inspect the overlap between the first two chunks.
    print("测试3: 验证重叠")
    # Text with clearly recognizable content.
    base = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    long_text = base * 50  # 1300 characters
    result = handler.smart_split_transcription(long_text, limit=500, overlap=100)
    print(f"输入长度: {len(long_text)}, 输出块数: {len(result)}")

    if len(result) > 1:
        end_of_first = result[0][-50:]
        start_of_second = result[1][:50]
        print(f"\n第一块末尾50字符: {end_of_first}")
        print(f"第二块开头50字符: {start_of_second}")

        # Longest suffix of the first window that equals a prefix of the
        # second window.
        # NOTE(review): both windows are at most 50 characters, so the value
        # reported here can never exceed 50 even though overlap=100 was
        # requested, and the input repeats every 26 characters, so a match may
        # be coincidental.
        max_possible = min(len(end_of_first), len(start_of_second))
        overlap_len = max(
            (
                size
                for size in range(1, max_possible + 1)
                if end_of_first.endswith(start_of_second[:size])
            ),
            default=0,
        )
        print(f"检测到的重叠长度: {overlap_len}")
    print()

    # Case 4: boundary condition where overlap is not smaller than limit.
    print("测试4: 边界情况 - overlap >= limit")
    text = "测试文本" * 100
    result = handler.smart_split_transcription(text, limit=200, overlap=200)
    print(f"输入长度: {len(text)}, limit=200, overlap=200")
    print(f"输出块数: {len(result)}")
    for number, chunk in enumerate(result, start=1):
        print(f"块{number}长度: {len(chunk)}")
    print()

    # Case 5: text that contains newline characters.
    print("测试5: 包含换行符的文本")
    text_with_newlines = "第一行\n第二行\n第三行\n" * 100
    result = handler.smart_split_transcription(text_with_newlines, limit=500, overlap=50)
    print(f"输入长度: {len(text_with_newlines)}, 输出块数: {len(result)}")
    # Evaluate the membership test outside the f-string: a backslash inside an
    # f-string expression is a SyntaxError before Python 3.12, and the escaped
    # form '\\n' would look for a literal backslash + "n" instead of a newline.
    keeps_newlines = "\n" in result[0]
    print(f"第一块是否保留换行符: {keeps_newlines}")
    print()

    # Case 6: extreme input - empty text.
    print("测试6: 极端情况 - 空文本")
    result = handler.smart_split_transcription("", limit=1000, overlap=100)
    print(f"空文本输入,输出: {result}")
    print()

    # Case 7: extreme input - a single character.
    print("测试7: 极端情况 - 单个字符")
    result = handler.smart_split_transcription("A", limit=1000, overlap=100)
    print(f"单字符输入,输出: {result}")


if __name__ == "__main__":
    # Run the manual checks only when this file is executed as a script.
    test_smart_split()