修复分段逻辑
This commit is contained in:
parent
653ee4af13
commit
545402617b
@ -334,49 +334,35 @@ class MediaSplitHandle(BaseSplitHandle):
|
||||
return f"{minutes:02d}:{secs:02d}"
|
||||
|
||||
def smart_split_transcription(self, text, limit=1000, overlap=100):
    """Split transcription text into fixed-size, overlapping chunks.

    The text is cut into windows of at most ``limit`` characters. Each
    window starts ``limit - overlap`` characters after the previous one,
    so consecutive chunks share ``overlap`` characters. Chunks are plain
    slices of ``text``: whitespace and line breaks are left untouched.

    Args:
        text: The transcription text to split.
        limit: Maximum number of characters per chunk.
        overlap: Number of characters shared by consecutive chunks.
            Negative values are treated as 0; values greater than or
            equal to ``limit`` are reduced to ``limit // 2``.

    Returns:
        A list of chunks in document order. Text that already fits in
        ``limit`` is returned unchanged as a single-element list.

    Raises:
        ValueError: If the text has to be split and ``limit`` is smaller
            than 1, since no window size could make progress.
    """
    # Nothing to split: the whole text fits into one chunk.
    if len(text) <= limit:
        return [text]

    # A non-positive limit would yield a zero or negative step below and
    # the loop would never terminate.
    if limit < 1:
        raise ValueError("limit must be a positive integer")

    # Keep the overlap inside [0, limit) so the window always advances
    # and never skips over text between two chunks.
    if overlap < 0:
        overlap = 0
    elif overlap >= limit:
        overlap = limit // 2

    step_size = limit - overlap  # distance between consecutive chunk starts
    text_length = len(text)

    chunks = []
    start = 0
    while True:
        # The last window is shortened to the end of the text.
        end = min(start + limit, text_length)
        chunks.append(text[start:end])

        # Stop as soon as a window reaches the end of the text.
        if end >= text_length:
            break

        # step_size >= 1 here, so every iteration makes progress.
        start += step_size

    return chunks
|
||||
|
||||
def _apply_smart_split(self, result: dict, limit: int = 1000, with_filter: bool = False):
|
||||
|
||||
77
test_smart_split.py
Normal file
77
test_smart_split.py
Normal file
@ -0,0 +1,77 @@
|
||||
#!/usr/bin/env python
|
||||
"""测试优化后的 smart_split_transcription 函数"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
|
||||
|
||||
from apps.common.handle.impl.media.media_split_handle import MediaSplitHandle
|
||||
|
||||
def test_smart_split():
    """Run manual smoke checks of ``smart_split_transcription``.

    Each scenario calls the splitter and prints the input length, the
    number of chunks and a few chunk details for visual inspection.
    Nothing is asserted; the output has to be read by a person.
    """
    handler = MediaSplitHandle()

    # Scenario 1: text that already fits into the limit.
    print("测试1: 短文本(≤ limit)")
    short_text = "这是一个短文本"
    result = handler.smart_split_transcription(short_text, limit=1000, overlap=100)
    print(f"输入长度: {len(short_text)}, 输出块数: {len(result)}")
    print(f"结果: {result}")
    print()

    # Scenario 2: text longer than the limit.
    print("测试2: 长文本(> limit)")
    long_text = "这是一段很长的文本。" * 200  # 10 characters x 200 = 2000 characters
    result = handler.smart_split_transcription(long_text, limit=1000, overlap=100)
    print(f"输入长度: {len(long_text)}, 输出块数: {len(result)}")
    print(f"第一块长度: {len(result[0])}")
    print(f"最后一块长度: {len(result[-1])}")
    print()

    # Scenario 3: inspect the overlap between the first two chunks.
    print("测试3: 验证重叠")
    # Text built from a recognisable repeating pattern.
    base = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    long_text = base * 50  # 26 characters x 50 = 1300 characters
    result = handler.smart_split_transcription(long_text, limit=500, overlap=100)
    print(f"输入长度: {len(long_text)}, 输出块数: {len(result)}")

    if len(result) > 1:
        end_of_first = result[0][-50:]
        start_of_second = result[1][:50]
        print(f"\n第一块末尾50字符: {end_of_first}")
        print(f"第二块开头50字符: {start_of_second}")

        # Longest suffix of the first window that equals a prefix of the
        # second window.
        # NOTE(review): only 50-character windows are compared and the text
        # is periodic, so this value is not the real overlap of 100.
        overlap_len = 0
        max_possible = min(len(end_of_first), len(start_of_second))
        for i in range(max_possible):
            if end_of_first[-(i + 1):] == start_of_second[:i + 1]:
                overlap_len = i + 1
        print(f"检测到的重叠长度: {overlap_len}")
    print()

    # Scenario 4: overlap equal to the limit.
    print("测试4: 边界情况 - overlap >= limit")
    text = "测试文本" * 100
    result = handler.smart_split_transcription(text, limit=200, overlap=200)
    print(f"输入长度: {len(text)}, limit=200, overlap=200")
    print(f"输出块数: {len(result)}")
    for i, chunk in enumerate(result):
        print(f"块{i+1}长度: {len(chunk)}")
    print()

    # Scenario 5: text containing line breaks.
    print("测试5: 包含换行符的文本")
    text_with_newlines = "第一行\n第二行\n第三行\n" * 100
    result = handler.smart_split_transcription(text_with_newlines, limit=500, overlap=50)
    print(f"输入长度: {len(text_with_newlines)}, 输出块数: {len(result)}")
    # Computed outside the f-string: a backslash inside a replacement field
    # is a SyntaxError before Python 3.12, and the check must look for a
    # real newline character rather than a backslash followed by "n".
    has_newline = "\n" in result[0]
    print(f"第一块是否保留换行符: {has_newline}")
    print()

    # Scenario 6: empty input.
    print("测试6: 极端情况 - 空文本")
    result = handler.smart_split_transcription("", limit=1000, overlap=100)
    print(f"空文本输入,输出: {result}")
    print()

    # Scenario 7: single-character input.
    print("测试7: 极端情况 - 单个字符")
    result = handler.smart_split_transcription("A", limit=1000, overlap=100)
    print(f"单字符输入,输出: {result}")
|
||||
|
||||
# Run the manual checks only when this file is executed as a script.
if __name__ == "__main__":
    test_smart_split()
|
||||
Loading…
Reference in New Issue
Block a user