修复分段逻辑
Some checks are pending
sync2gitee / repo-sync (push) Waiting to run
Typos Check / Spell Check with Typos (push) Waiting to run

This commit is contained in:
朱潮 2025-12-19 12:34:22 +08:00
parent 653ee4af13
commit 545402617b
2 changed files with 92 additions and 29 deletions

View File

@ -334,49 +334,35 @@ class MediaSplitHandle(BaseSplitHandle):
return f"{minutes:02d}:{secs:02d}"
def smart_split_transcription(self, text, limit=1000, overlap=100):
    """Split transcription text into fixed-size, overlapping chunks.

    The text is cut into windows of at most ``limit`` characters. Each new
    window starts ``limit - overlap`` characters after the previous one, so
    consecutive chunks share ``overlap`` characters. Chunks are plain
    slices of ``text``: no stripping, no blank-line removal and no
    sentence-boundary search, so the original formatting is preserved.

    Args:
        text: The text to split.
        limit: Maximum number of characters per chunk. Must be positive.
        overlap: Number of characters shared by consecutive chunks. A value
            greater than or equal to ``limit`` is reduced to ``limit // 2``;
            a negative value is treated as 0.

    Returns:
        A list of chunks. Text no longer than ``limit`` (including the
        empty string) is returned as a single-element list.

    Raises:
        ValueError: If ``limit`` is not positive.
    """
    if limit <= 0:
        # A non-positive limit would produce a zero step and never terminate.
        raise ValueError("limit must be a positive integer")

    # Short input: nothing to split.
    if len(text) <= limit:
        return [text]

    # Keep the step strictly positive and never larger than the window.
    if overlap >= limit:
        overlap = limit // 2
    # A negative overlap would make the step exceed the window and drop text.
    overlap = max(overlap, 0)

    chunks = []
    start = 0
    step_size = limit - overlap  # distance advanced per chunk
    text_length = len(text)

    while start < text_length:
        end = min(start + limit, text_length)

        # Take the window verbatim (formatting is left untouched).
        chunks.append(text[start:end])

        # Stop once the window reaches the end of the text; advancing again
        # would only emit a chunk fully contained in the previous one.
        if end >= text_length:
            break

        # step_size > 0, so this always makes progress.
        start += step_size

    return chunks
def _apply_smart_split(self, result: dict, limit: int = 1000, with_filter: bool = False):

77
test_smart_split.py Normal file
View File

@ -0,0 +1,77 @@
#!/usr/bin/env python
"""测试优化后的 smart_split_transcription 函数"""
import sys
import os
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
from apps.common.handle.impl.media.media_split_handle import MediaSplitHandle
def test_smart_split():
    """Run ``smart_split_transcription`` on sample inputs and print the results.

    This is a print-based smoke script rather than an assertion-based test:
    each case calls the splitter on a ``MediaSplitHandle`` instance and
    prints input length, chunk count and a few chunk details for manual
    inspection.
    """
    handler = MediaSplitHandle()

    # Case 1: text no longer than the limit.
    print("测试1: 短文本(≤ limit")
    short_text = "这是一个短文本"
    result = handler.smart_split_transcription(short_text, limit=1000, overlap=100)
    print(f"输入长度: {len(short_text)}, 输出块数: {len(result)}")
    print(f"结果: {result}")
    print()

    # Case 2: text longer than the limit.
    print("测试2: 长文本(> limit")
    long_text = "这是一段很长的文本。" * 200  # 10 characters x 200 = 2000 characters
    result = handler.smart_split_transcription(long_text, limit=1000, overlap=100)
    print(f"输入长度: {len(long_text)}, 输出块数: {len(result)}")
    print(f"第一块长度: {len(result[0])}")
    print(f"最后一块长度: {len(result[-1])}")
    print()

    # Case 3: measure the overlap between the first two chunks.
    print("测试3: 验证重叠")
    base = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    long_text = base * 50  # 26 x 50 = 1300 characters
    overlap = 100
    result = handler.smart_split_transcription(long_text, limit=500, overlap=overlap)
    print(f"输入长度: {len(long_text)}, 输出块数: {len(result)}")
    if len(result) > 1:
        end_of_first = result[0][-50:]
        start_of_second = result[1][:50]
        print(f"\n第一块末尾50字符: {end_of_first}")
        print(f"第二块开头50字符: {start_of_second}")
        # Compare windows as wide as the requested overlap; a 50-character
        # window could never report a 100-character overlap. The sample text
        # is periodic, so shorter coincidental matches exist too: keep the
        # longest suffix of chunk 1 that is also a prefix of chunk 2.
        tail = result[0][-overlap:]
        head = result[1][:overlap]
        overlap_len = 0
        for size in range(1, min(len(tail), len(head)) + 1):
            if tail[-size:] == head[:size]:
                overlap_len = size
        print(f"检测到的重叠长度: {overlap_len}")
    print()

    # Case 4: overlap not smaller than the limit.
    print("测试4: 边界情况 - overlap >= limit")
    text = "测试文本" * 100
    result = handler.smart_split_transcription(text, limit=200, overlap=200)
    print(f"输入长度: {len(text)}, limit=200, overlap=200")
    print(f"输出块数: {len(result)}")
    for index, chunk in enumerate(result, start=1):
        print(f"{index}长度: {len(chunk)}")
    print()

    # Case 5: text containing newlines.
    print("测试5: 包含换行符的文本")
    text_with_newlines = "第一行\n第二行\n第三行\n" * 100
    result = handler.smart_split_transcription(text_with_newlines, limit=500, overlap=50)
    print(f"输入长度: {len(text_with_newlines)}, 输出块数: {len(result)}")
    # Evaluate outside the f-string: a backslash inside a replacement field
    # is a SyntaxError before Python 3.12, and '\\n' would look for a literal
    # backslash followed by "n" instead of a newline character.
    has_newline = "\n" in result[0]
    print(f"第一块是否保留换行符: {has_newline}")
    print()

    # Case 6: empty text.
    print("测试6: 极端情况 - 空文本")
    result = handler.smart_split_transcription("", limit=1000, overlap=100)
    print(f"空文本输入,输出: {result}")
    print()

    # Case 7: a single character.
    print("测试7: 极端情况 - 单个字符")
    result = handler.smart_split_transcription("A", limit=1000, overlap=100)
    print(f"单字符输入,输出: {result}")
# Run the smoke checks only when this file is executed as a script.
if __name__ == "__main__":
    test_smart_split()