add logs
This commit is contained in:
parent
653ee4af13
commit
c80eebd6c0
@ -2,6 +2,7 @@
|
|||||||
"""
|
"""
|
||||||
音视频处理器 - MaxKB集成层
|
音视频处理器 - MaxKB集成层
|
||||||
"""
|
"""
|
||||||
|
import traceback
|
||||||
from typing import List
|
from typing import List
|
||||||
from common.handle.base_split_handle import BaseSplitHandle
|
from common.handle.base_split_handle import BaseSplitHandle
|
||||||
from common.utils.logger import maxkb_logger
|
from common.utils.logger import maxkb_logger
|
||||||
@ -335,18 +336,30 @@ class MediaSplitHandle(BaseSplitHandle):
|
|||||||
|
|
||||||
def smart_split_transcription(self, text, limit=1000, overlap=100):
|
def smart_split_transcription(self, text, limit=1000, overlap=100):
|
||||||
"""智能分割转录文本"""
|
"""智能分割转录文本"""
|
||||||
|
maxkb_logger.info(f"🔧 smart_split_transcription called - text length: {len(text)}, limit: {limit}, overlap: {overlap}")
|
||||||
|
|
||||||
if len(text) <= limit:
|
if len(text) <= limit:
|
||||||
|
maxkb_logger.info(f"✅ Text length {len(text)} <= limit {limit}, no splitting needed")
|
||||||
return [text]
|
return [text]
|
||||||
|
|
||||||
# 预处理:移除过多的空行
|
# 预处理:移除过多的空行
|
||||||
|
original_length = len(text)
|
||||||
text = '\n'.join(line for line in text.split('\n') if line.strip())
|
text = '\n'.join(line for line in text.split('\n') if line.strip())
|
||||||
|
cleaned_length = len(text)
|
||||||
|
maxkb_logger.info(f"🧹 Text preprocessing: {original_length} -> {cleaned_length} characters")
|
||||||
|
|
||||||
chunks = []
|
chunks = []
|
||||||
start = 0
|
start = 0
|
||||||
|
iteration = 0
|
||||||
|
max_iterations = len(text) // (limit // 2) + 10 # 防止无限循环
|
||||||
|
|
||||||
|
while start < len(text) and iteration < max_iterations:
|
||||||
|
iteration += 1
|
||||||
|
maxkb_logger.info(f"🔄 Iteration {iteration}: start={start}, text_length={len(text)}")
|
||||||
|
|
||||||
while start < len(text):
|
|
||||||
# 计算当前块的结束位置
|
# 计算当前块的结束位置
|
||||||
end = min(start + limit, len(text))
|
end = min(start + limit, len(text))
|
||||||
|
maxkb_logger.info(f"📍 Initial end position: {end}")
|
||||||
|
|
||||||
# 如果不是最后一块,尝试在句号处断开
|
# 如果不是最后一块,尝试在句号处断开
|
||||||
if end < len(text):
|
if end < len(text):
|
||||||
@ -355,39 +368,68 @@ class MediaSplitHandle(BaseSplitHandle):
|
|||||||
|
|
||||||
# 优先在句号处断开
|
# 优先在句号处断开
|
||||||
last_period = text.rfind('。', start, end)
|
last_period = text.rfind('。', start, end)
|
||||||
|
maxkb_logger.info(f"🔍 Last period position: {last_period}")
|
||||||
|
|
||||||
if last_period > start + limit // 2:
|
if last_period > start + limit // 2:
|
||||||
best_end = last_period + 1
|
best_end = last_period + 1
|
||||||
|
maxkb_logger.info(f"✂️ Splitting at period: position {best_end}")
|
||||||
else:
|
else:
|
||||||
# 其次在换行符处断开
|
# 其次在换行符处断开
|
||||||
last_newline = text.rfind('\n', start, end)
|
last_newline = text.rfind('\n', start, end)
|
||||||
|
maxkb_logger.info(f"🔍 Last newline position: {last_newline}")
|
||||||
|
|
||||||
if last_newline > start + limit // 3:
|
if last_newline > start + limit // 3:
|
||||||
best_end = last_newline + 1
|
best_end = last_newline + 1
|
||||||
|
maxkb_logger.info(f"✂️ Splitting at newline: position {best_end}")
|
||||||
|
else:
|
||||||
|
maxkb_logger.info(f"⚠️ No good split point found, using default position {end}")
|
||||||
|
|
||||||
end = best_end
|
end = best_end
|
||||||
|
|
||||||
# 提取当前块
|
# 提取当前块
|
||||||
chunk = text[start:end].strip()
|
chunk = text[start:end].strip()
|
||||||
|
chunk_length = len(chunk)
|
||||||
|
maxkb_logger.info(f"📦 Chunk {len(chunks)+1}: length={chunk_length}, start={start}, end={end}")
|
||||||
|
maxkb_logger.info(f"📝 Chunk preview: '{chunk[:100]}...'")
|
||||||
|
|
||||||
if chunk:
|
if chunk:
|
||||||
chunks.append(chunk)
|
chunks.append(chunk)
|
||||||
|
else:
|
||||||
|
maxkb_logger.warning(f"⚠️ Empty chunk created, skipping")
|
||||||
|
|
||||||
# 计算下一块的开始位置(考虑重叠)
|
# 计算下一块的开始位置(考虑重叠)
|
||||||
start = max(end - overlap, len(chunks[-1]) if chunks else 0)
|
# 修复:确保start总是增加的
|
||||||
|
old_start = start
|
||||||
|
start = max(end - overlap, end - chunk_length // 2)
|
||||||
|
|
||||||
|
# 确保至少前进1个字符
|
||||||
|
if start <= old_start:
|
||||||
|
start = old_start + 1
|
||||||
|
maxkb_logger.warning(f"⚠️ Adjusting start position to prevent infinite loop: {start}")
|
||||||
|
|
||||||
|
maxkb_logger.info(f"➡️ Next start position: {start}")
|
||||||
|
|
||||||
# 避免无限循环
|
# 避免无限循环
|
||||||
if start >= len(text):
|
if start >= len(text):
|
||||||
|
maxkb_logger.info(f"🏁 Reached end of text")
|
||||||
break
|
break
|
||||||
|
|
||||||
|
if iteration >= max_iterations:
|
||||||
|
maxkb_logger.error(f"❌ Max iterations ({max_iterations}) reached, potential infinite loop")
|
||||||
|
|
||||||
|
maxkb_logger.info(f"✅ Split completed: {len(chunks)} chunks created in {iteration} iterations")
|
||||||
return chunks
|
return chunks
|
||||||
|
|
||||||
def _apply_smart_split(self, result: dict, limit: int = 1000, with_filter: bool = False):
|
def _apply_smart_split(self, result: dict, limit: int = 1000, with_filter: bool = False):
|
||||||
"""应用智能分块到转录结果"""
|
"""应用智能分块到转录结果"""
|
||||||
overlap = 100 # 前后重叠字符数
|
try:
|
||||||
|
overlap = 100 # 前后重叠字符数
|
||||||
|
|
||||||
maxkb_logger.info(f"🔧 Starting smart split process - limit: {limit}, with_filter: {with_filter}")
|
maxkb_logger.info(f"🔧 Starting smart split process - limit: {limit}, with_filter: {with_filter}")
|
||||||
original_paragraphs = result.get('content', [])
|
original_paragraphs = result.get('content', [])
|
||||||
maxkb_logger.info(f"📊 Original paragraphs count: {len(original_paragraphs)}")
|
maxkb_logger.info(f"📊 Original paragraphs count: {len(original_paragraphs)}")
|
||||||
|
|
||||||
new_paragraphs = []
|
new_paragraphs = []
|
||||||
total_chunks_created = 0
|
total_chunks_created = 0
|
||||||
|
|
||||||
for idx, paragraph in enumerate(original_paragraphs):
|
for idx, paragraph in enumerate(original_paragraphs):
|
||||||
@ -408,15 +450,17 @@ class MediaSplitHandle(BaseSplitHandle):
|
|||||||
maxkb_logger.info(f"✂️ Paragraph {idx+1} needs splitting (length {content_length} > limit {limit})")
|
maxkb_logger.info(f"✂️ Paragraph {idx+1} needs splitting (length {content_length} > limit {limit})")
|
||||||
|
|
||||||
# 应用智能分块
|
# 应用智能分块
|
||||||
chunks = self.smart_split_transcription(content, limit, overlap)
|
try:
|
||||||
maxkb_logger.info(f"✂️ Split paragraph {idx+1} into {len(chunks)} chunks")
|
maxkb_logger.info(f"🔄 Calling smart_split_transcription for paragraph {idx+1}")
|
||||||
|
chunks = self.smart_split_transcription(content, limit, overlap)
|
||||||
|
maxkb_logger.info(f"✂️ Split paragraph {idx+1} into {len(chunks)} chunks")
|
||||||
|
|
||||||
# 记录每个chunk的详细信息
|
# 记录每个chunk的详细信息
|
||||||
for c_idx, chunk in enumerate(chunks):
|
for c_idx, chunk in enumerate(chunks):
|
||||||
maxkb_logger.info(f"📦 Chunk {c_idx+1}/{len(chunks)}: length={len(chunk)}, preview='{chunk[:50]}...'")
|
maxkb_logger.info(f"📦 Chunk {c_idx+1}/{len(chunks)}: length={len(chunk)}, preview='{chunk[:50]}...'")
|
||||||
|
|
||||||
# 创建新的段落
|
# 创建新的段落
|
||||||
for c_idx, chunk in enumerate(chunks):
|
for c_idx, chunk in enumerate(chunks):
|
||||||
# 保留原始元数据,但更新分段相关信息
|
# 保留原始元数据,但更新分段相关信息
|
||||||
metadata = paragraph.get('metadata', {}).copy()
|
metadata = paragraph.get('metadata', {}).copy()
|
||||||
metadata.update({
|
metadata.update({
|
||||||
@ -437,6 +481,14 @@ class MediaSplitHandle(BaseSplitHandle):
|
|||||||
}
|
}
|
||||||
new_paragraphs.append(new_paragraph)
|
new_paragraphs.append(new_paragraph)
|
||||||
total_chunks_created += 1
|
total_chunks_created += 1
|
||||||
|
|
||||||
|
except Exception as split_error:
|
||||||
|
maxkb_logger.error(f"❌ Error splitting paragraph {idx+1}: {str(split_error)}")
|
||||||
|
maxkb_logger.error(f"🔍 Split error traceback: {traceback.format_exc()}")
|
||||||
|
# 如果分块失败,保留原始段落
|
||||||
|
maxkb_logger.info(f"🔄 Keeping original paragraph due to split error")
|
||||||
|
new_paragraphs.append(paragraph)
|
||||||
|
total_chunks_created += 1
|
||||||
else:
|
else:
|
||||||
maxkb_logger.info(f"📄 Paragraph {idx+1} does not need splitting (length {content_length} <= limit {limit})")
|
maxkb_logger.info(f"📄 Paragraph {idx+1} does not need splitting (length {content_length} <= limit {limit})")
|
||||||
new_paragraphs.append(paragraph)
|
new_paragraphs.append(paragraph)
|
||||||
@ -462,9 +514,16 @@ class MediaSplitHandle(BaseSplitHandle):
|
|||||||
result['metadata'] = metadata
|
result['metadata'] = metadata
|
||||||
|
|
||||||
maxkb_logger.info(f"✅ Smart split completed - original: {len(original_paragraphs)} paragraphs, final: {len(new_paragraphs)} chunks")
|
maxkb_logger.info(f"✅ Smart split completed - original: {len(original_paragraphs)} paragraphs, final: {len(new_paragraphs)} chunks")
|
||||||
maxkb_logger.info(f"📈 Total chunks created: {total_chunks_created}")
|
maxkb_logger.info(f"📈 Total chunks created: {total_chunks_created}")
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
maxkb_logger.error(f"❌ Fatal error in _apply_smart_split: {str(e)}")
|
||||||
|
maxkb_logger.error(f"🔍 Full error traceback: {traceback.format_exc()}")
|
||||||
|
# 返回原始结果,不做处理
|
||||||
|
maxkb_logger.info(f"🔄 Returning original result without splitting")
|
||||||
|
return result
|
||||||
|
|
||||||
def _clean_text(self, text):
|
def _clean_text(self, text):
|
||||||
"""清理文本:去掉重复多余符号空格、空行、制表符"""
|
"""清理文本:去掉重复多余符号空格、空行、制表符"""
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user