add logs
Some checks are pending
sync2gitee / repo-sync (push) Waiting to run
Typos Check / Spell Check with Typos (push) Waiting to run

This commit is contained in:
朱潮 2025-12-19 11:24:36 +08:00
parent 653ee4af13
commit c80eebd6c0

View File

@@ -2,6 +2,7 @@
"""
音视频处理器 - MaxKB集成层
"""
import traceback
from typing import List
from common.handle.base_split_handle import BaseSplitHandle
from common.utils.logger import maxkb_logger
@@ -335,18 +336,30 @@ class MediaSplitHandle(BaseSplitHandle):
def smart_split_transcription(self, text, limit=1000, overlap=100):
"""智能分割转录文本"""
maxkb_logger.info(f"🔧 smart_split_transcription called - text length: {len(text)}, limit: {limit}, overlap: {overlap}")
if len(text) <= limit:
maxkb_logger.info(f"✅ Text length {len(text)} <= limit {limit}, no splitting needed")
return [text]
# 预处理:移除过多的空行
original_length = len(text)
text = '\n'.join(line for line in text.split('\n') if line.strip())
cleaned_length = len(text)
maxkb_logger.info(f"🧹 Text preprocessing: {original_length} -> {cleaned_length} characters")
chunks = []
start = 0
iteration = 0
max_iterations = len(text) // (limit // 2) + 10 # 防止无限循环
while start < len(text) and iteration < max_iterations:
iteration += 1
maxkb_logger.info(f"🔄 Iteration {iteration}: start={start}, text_length={len(text)}")
while start < len(text):
# 计算当前块的结束位置
end = min(start + limit, len(text))
maxkb_logger.info(f"📍 Initial end position: {end}")
# 如果不是最后一块,尝试在句号处断开
if end < len(text):
@@ -355,32 +368,61 @@ class MediaSplitHandle(BaseSplitHandle):
# 优先在句号处断开
last_period = text.rfind('', start, end)
maxkb_logger.info(f"🔍 Last period position: {last_period}")
if last_period > start + limit // 2:
best_end = last_period + 1
maxkb_logger.info(f"✂️ Splitting at period: position {best_end}")
else:
# 其次在换行符处断开
last_newline = text.rfind('\n', start, end)
maxkb_logger.info(f"🔍 Last newline position: {last_newline}")
if last_newline > start + limit // 3:
best_end = last_newline + 1
maxkb_logger.info(f"✂️ Splitting at newline: position {best_end}")
else:
maxkb_logger.info(f"⚠️ No good split point found, using default position {end}")
end = best_end
# 提取当前块
chunk = text[start:end].strip()
chunk_length = len(chunk)
maxkb_logger.info(f"📦 Chunk {len(chunks)+1}: length={chunk_length}, start={start}, end={end}")
maxkb_logger.info(f"📝 Chunk preview: '{chunk[:100]}...'")
if chunk:
chunks.append(chunk)
else:
maxkb_logger.warning(f"⚠️ Empty chunk created, skipping")
# 计算下一块的开始位置(考虑重叠)
start = max(end - overlap, len(chunks[-1]) if chunks else 0)
# 修复确保start总是增加的
old_start = start
start = max(end - overlap, end - chunk_length // 2)
# 确保至少前进1个字符
if start <= old_start:
start = old_start + 1
maxkb_logger.warning(f"⚠️ Adjusting start position to prevent infinite loop: {start}")
maxkb_logger.info(f"➡️ Next start position: {start}")
# 避免无限循环
if start >= len(text):
maxkb_logger.info(f"🏁 Reached end of text")
break
if iteration >= max_iterations:
maxkb_logger.error(f"❌ Max iterations ({max_iterations}) reached, potential infinite loop")
maxkb_logger.info(f"✅ Split completed: {len(chunks)} chunks created in {iteration} iterations")
return chunks
def _apply_smart_split(self, result: dict, limit: int = 1000, with_filter: bool = False):
"""应用智能分块到转录结果"""
try:
overlap = 100 # 前后重叠字符数
maxkb_logger.info(f"🔧 Starting smart split process - limit: {limit}, with_filter: {with_filter}")
@@ -408,6 +450,8 @@ class MediaSplitHandle(BaseSplitHandle):
maxkb_logger.info(f"✂️ Paragraph {idx+1} needs splitting (length {content_length} > limit {limit})")
# 应用智能分块
try:
maxkb_logger.info(f"🔄 Calling smart_split_transcription for paragraph {idx+1}")
chunks = self.smart_split_transcription(content, limit, overlap)
maxkb_logger.info(f"✂️ Split paragraph {idx+1} into {len(chunks)} chunks")
@@ -437,6 +481,14 @@ class MediaSplitHandle(BaseSplitHandle):
}
new_paragraphs.append(new_paragraph)
total_chunks_created += 1
except Exception as split_error:
maxkb_logger.error(f"❌ Error splitting paragraph {idx+1}: {str(split_error)}")
maxkb_logger.error(f"🔍 Split error traceback: {traceback.format_exc()}")
# 如果分块失败,保留原始段落
maxkb_logger.info(f"🔄 Keeping original paragraph due to split error")
new_paragraphs.append(paragraph)
total_chunks_created += 1
else:
maxkb_logger.info(f"📄 Paragraph {idx+1} does not need splitting (length {content_length} <= limit {limit})")
new_paragraphs.append(paragraph)
@@ -466,6 +518,13 @@ class MediaSplitHandle(BaseSplitHandle):
return result
except Exception as e:
maxkb_logger.error(f"❌ Fatal error in _apply_smart_split: {str(e)}")
maxkb_logger.error(f"🔍 Full error traceback: {traceback.format_exc()}")
# 返回原始结果,不做处理
maxkb_logger.info(f"🔄 Returning original result without splitting")
return result
def _clean_text(self, text):
"""清理文本:去掉重复多余符号空格、空行、制表符"""
import re