From c80eebd6c0e6fe80d53c3f3c76f15ad25fec083a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9C=B1=E6=BD=AE?= Date: Fri, 19 Dec 2025 11:24:36 +0800 Subject: [PATCH] add logs --- .../handle/impl/media/media_split_handle.py | 89 +++++++++++++++---- 1 file changed, 74 insertions(+), 15 deletions(-) diff --git a/apps/common/handle/impl/media/media_split_handle.py b/apps/common/handle/impl/media/media_split_handle.py index d1432205..d584429f 100644 --- a/apps/common/handle/impl/media/media_split_handle.py +++ b/apps/common/handle/impl/media/media_split_handle.py @@ -2,6 +2,7 @@ """ 音视频处理器 - MaxKB集成层 """ +import traceback from typing import List from common.handle.base_split_handle import BaseSplitHandle from common.utils.logger import maxkb_logger @@ -335,18 +336,30 @@ class MediaSplitHandle(BaseSplitHandle): def smart_split_transcription(self, text, limit=1000, overlap=100): """智能分割转录文本""" + maxkb_logger.info(f"🔧 smart_split_transcription called - text length: {len(text)}, limit: {limit}, overlap: {overlap}") + if len(text) <= limit: + maxkb_logger.info(f"✅ Text length {len(text)} <= limit {limit}, no splitting needed") return [text] # 预处理:移除过多的空行 + original_length = len(text) text = '\n'.join(line for line in text.split('\n') if line.strip()) + cleaned_length = len(text) + maxkb_logger.info(f"🧹 Text preprocessing: {original_length} -> {cleaned_length} characters") chunks = [] start = 0 + iteration = 0 + max_iterations = len(text) // (limit // 2) + 10 # 防止无限循环 + + while start < len(text) and iteration < max_iterations: + iteration += 1 + maxkb_logger.info(f"🔄 Iteration {iteration}: start={start}, text_length={len(text)}") - while start < len(text): # 计算当前块的结束位置 end = min(start + limit, len(text)) + maxkb_logger.info(f"📍 Initial end position: {end}") # 如果不是最后一块,尝试在句号处断开 if end < len(text): @@ -355,39 +368,68 @@ class MediaSplitHandle(BaseSplitHandle): # 优先在句号处断开 last_period = text.rfind('。', start, end) + maxkb_logger.info(f"🔍 Last period position: {last_period}") + if last_period > start + limit // 2: best_end = last_period + 1 + maxkb_logger.info(f"✂️ Splitting at period: position {best_end}") else: # 其次在换行符处断开 last_newline = text.rfind('\n', start, end) + maxkb_logger.info(f"🔍 Last newline position: {last_newline}") + if last_newline > start + limit // 3: best_end = last_newline + 1 + maxkb_logger.info(f"✂️ Splitting at newline: position {best_end}") + else: + maxkb_logger.info(f"⚠️ No good split point found, using default position {end}") end = best_end # 提取当前块 chunk = text[start:end].strip() + chunk_length = len(chunk) + maxkb_logger.info(f"📦 Chunk {len(chunks)+1}: length={chunk_length}, start={start}, end={end}") + maxkb_logger.info(f"📝 Chunk preview: '{chunk[:100]}...'") + if chunk: chunks.append(chunk) + else: + maxkb_logger.warning(f"⚠️ Empty chunk created, skipping") # 计算下一块的开始位置(考虑重叠) - start = max(end - overlap, len(chunks[-1]) if chunks else 0) + # 修复:确保start总是增加的 + old_start = start + start = max(end - overlap, end - chunk_length // 2) + + # 确保至少前进1个字符 + if start <= old_start: + start = old_start + 1 + maxkb_logger.warning(f"⚠️ Adjusting start position to prevent infinite loop: {start}") + + maxkb_logger.info(f"➡️ Next start position: {start}") # 避免无限循环 if start >= len(text): + maxkb_logger.info(f"🏁 Reached end of text") break + if iteration >= max_iterations: + maxkb_logger.error(f"❌ Max iterations ({max_iterations}) reached, potential infinite loop") + + maxkb_logger.info(f"✅ Split completed: {len(chunks)} chunks created in {iteration} iterations") return chunks def _apply_smart_split(self, result: dict, limit: int = 1000, with_filter: bool = False): """应用智能分块到转录结果""" - overlap = 100 # 前后重叠字符数 + try: + overlap = 100 # 前后重叠字符数 - maxkb_logger.info(f"🔧 Starting smart split process - limit: {limit}, with_filter: {with_filter}") + maxkb_logger.info(f"🔧 Starting smart split process - limit: {limit}, with_filter: {with_filter}") original_paragraphs = result.get('content', []) - maxkb_logger.info(f"📊 Original paragraphs count: {len(original_paragraphs)}") + maxkb_logger.info(f"📊 Original paragraphs count: {len(original_paragraphs)}") - new_paragraphs = [] + new_paragraphs = [] total_chunks_created = 0 for idx, paragraph in enumerate(original_paragraphs): @@ -408,15 +450,17 @@ class MediaSplitHandle(BaseSplitHandle): maxkb_logger.info(f"✂️ Paragraph {idx+1} needs splitting (length {content_length} > limit {limit})") # 应用智能分块 - chunks = self.smart_split_transcription(content, limit, overlap) - maxkb_logger.info(f"✂️ Split paragraph {idx+1} into {len(chunks)} chunks") + try: + maxkb_logger.info(f"🔄 Calling smart_split_transcription for paragraph {idx+1}") + chunks = self.smart_split_transcription(content, limit, overlap) + maxkb_logger.info(f"✂️ Split paragraph {idx+1} into {len(chunks)} chunks") - # 记录每个chunk的详细信息 - for c_idx, chunk in enumerate(chunks): - maxkb_logger.info(f"📦 Chunk {c_idx+1}/{len(chunks)}: length={len(chunk)}, preview='{chunk[:50]}...'") + # 记录每个chunk的详细信息 + for c_idx, chunk in enumerate(chunks): + maxkb_logger.info(f"📦 Chunk {c_idx+1}/{len(chunks)}: length={len(chunk)}, preview='{chunk[:50]}...'") - # 创建新的段落 - for c_idx, chunk in enumerate(chunks): + # 创建新的段落 + for c_idx, chunk in enumerate(chunks): # 保留原始元数据,但更新分段相关信息 metadata = paragraph.get('metadata', {}).copy() metadata.update({ @@ -437,6 +481,14 @@ class MediaSplitHandle(BaseSplitHandle): } new_paragraphs.append(new_paragraph) total_chunks_created += 1 + + except Exception as split_error: + maxkb_logger.error(f"❌ Error splitting paragraph {idx+1}: {str(split_error)}") + maxkb_logger.error(f"🔍 Split error traceback: {traceback.format_exc()}") + # 如果分块失败,保留原始段落 + maxkb_logger.info(f"🔄 Keeping original paragraph due to split error") + new_paragraphs.append(paragraph) + total_chunks_created += 1 else: maxkb_logger.info(f"📄 Paragraph {idx+1} does not need splitting (length {content_length} <= limit {limit})") new_paragraphs.append(paragraph) @@ -462,9 +514,16 @@ class MediaSplitHandle(BaseSplitHandle): result['metadata'] = metadata maxkb_logger.info(f"✅ Smart split completed - original: {len(original_paragraphs)} paragraphs, final: {len(new_paragraphs)} chunks") - maxkb_logger.info(f"📈 Total chunks created: {total_chunks_created}") + maxkb_logger.info(f"📈 Total chunks created: {total_chunks_created}") - return result + return result + + except Exception as e: + maxkb_logger.error(f"❌ Fatal error in _apply_smart_split: {str(e)}") + maxkb_logger.error(f"🔍 Full error traceback: {traceback.format_exc()}") + # 返回原始结果,不做处理 + maxkb_logger.info(f"🔄 Returning original result without splitting") + return result def _clean_text(self, text): """清理文本:去掉重复多余符号空格、空行、制表符"""