add logs
parent 3bbdbca861
commit 653ee4af13
@@ -126,17 +126,35 @@ class ListenerManagement:
         @param paragraph_id: paragraph id
         @param embedding_model: embedding model
         """
-        maxkb_logger.info(_('Start--->Embedding paragraph: {paragraph_id}').format(paragraph_id=paragraph_id))
+        maxkb_logger.info(f"🚀 Starting embedding for paragraph: {paragraph_id}")
+
+        # Get the paragraph info
+        paragraph = QuerySet(Paragraph).filter(id=paragraph_id).first()
+        if paragraph:
+            content_length = len(paragraph.content)
+            maxkb_logger.info(f"📄 Paragraph info - title: '{paragraph.title}', content length: {content_length}")
+            maxkb_logger.info(f"📝 Content preview: '{paragraph.content[:100]}...'")
+        else:
+            maxkb_logger.error(f"❌ Paragraph {paragraph_id} not found in database")
+            return
+
         # Update status to STARTED
+        maxkb_logger.info(f"🔄 Updating paragraph status to STARTED")
         ListenerManagement.update_status(QuerySet(Paragraph).filter(id=paragraph_id), TaskType.EMBEDDING, State.STARTED)

         try:
+            maxkb_logger.info(f"🔍 Searching data for paragraph embedding")
             data_list = native_search(
                 {'problem': QuerySet(get_dynamics_model({'paragraph.id': django.db.models.CharField()})).filter(
                     **{'paragraph.id': paragraph_id}),
                     'paragraph': QuerySet(Paragraph).filter(id=paragraph_id)},
                 select_string=get_file_content(
                     os.path.join(PROJECT_DIR, "apps", "common", 'sql', 'list_embedding_text.sql')))
-            # Delete the paragraph
+            maxkb_logger.info(f"📊 Found {len(data_list) if data_list else 0} data items for embedding")
+
+            # Delete the old vectors
+            maxkb_logger.info(f"🗑️ Deleting old embedding for paragraph {paragraph_id}")
             VectorStore.get_embedding_vector().delete_by_paragraph_id(paragraph_id)

             def is_the_task_interrupted():
@@ -146,13 +164,17 @@ class ListenerManagement:
                 return False

             # Batch vectorization
+            maxkb_logger.info(f"💾 Saving embeddings to vector store")
             VectorStore.get_embedding_vector().batch_save(data_list, embedding_model, is_the_task_interrupted)
-            # Update status to STARTED
+
+            # Update status to SUCCESS
+            maxkb_logger.info(f"✅ Updating paragraph status to SUCCESS")
             ListenerManagement.update_status(QuerySet(Paragraph).filter(id=paragraph_id), TaskType.EMBEDDING,
                                              State.SUCCESS)
         except Exception as e:
-            maxkb_logger.error(_('Vectorized paragraph: {paragraph_id} error {error} {traceback}').format(
-                paragraph_id=paragraph_id, error=str(e), traceback=traceback.format_exc()))
+            maxkb_logger.error(f"❌ Failed to embed paragraph {paragraph_id}: {str(e)}")
+            maxkb_logger.error(f"🔍 Full error traceback: {traceback.format_exc()}")
+            maxkb_logger.info(f"🔄 Updating paragraph status to FAILURE")
             ListenerManagement.update_status(QuerySet(Paragraph).filter(id=paragraph_id), TaskType.EMBEDDING,
                                              State.FAILURE)
         finally:
@@ -166,10 +188,24 @@ class ListenerManagement:
     @staticmethod
     def get_embedding_paragraph_apply(embedding_model, is_the_task_interrupted, post_apply=lambda: None):
         def embedding_paragraph_apply(paragraph_list):
-            for paragraph in paragraph_list:
+            maxkb_logger.info(f"🔍 Processing batch of {len(paragraph_list)} paragraphs for embedding")
+
+            for idx, paragraph in enumerate(paragraph_list):
                 if is_the_task_interrupted():
+                    maxkb_logger.warning(f"⚠️ Embedding task interrupted, stopping at paragraph {idx+1}/{len(paragraph_list)}")
                     break
-                ListenerManagement.embedding_by_paragraph(str(paragraph.get('id')), embedding_model)
+
+                paragraph_id = str(paragraph.get('id'))
+                maxkb_logger.info(f"📊 Embedding paragraph {idx+1}/{len(paragraph_list)}: ID={paragraph_id}")
+
+                try:
+                    ListenerManagement.embedding_by_paragraph(paragraph_id, embedding_model)
+                    maxkb_logger.info(f"✅ Successfully embedded paragraph {paragraph_id}")
+                except Exception as e:
+                    maxkb_logger.error(f"❌ Failed to embed paragraph {paragraph_id}: {str(e)}")
+                    # Keep processing the remaining paragraphs; do not abort the whole batch
+
+            maxkb_logger.info(f"🎉 Completed embedding batch processing")
             post_apply()

         return embedding_paragraph_apply
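For context, the factory above returns a closure that page_desc (further down in this diff) feeds one page of paragraph rows at a time. A hypothetical wiring sketch, using only names that appear in this diff; the real call site is the page_desc(...) invocation in the next hunk:

# Hypothetical usage sketch - paragraph rows arrive as dicts carrying an 'id' key.
handler = ListenerManagement.get_embedding_paragraph_apply(
    embedding_model,              # model reused for every paragraph in the batch
    is_the_task_interrupted,      # polled before each paragraph
    post_apply=lambda: None)      # runs once after the whole batch
handler([{'id': 'p-1'}, {'id': 'p-2'}])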
@@ -272,21 +308,32 @@ class ListenerManagement:
             return False

         if is_the_task_interrupted():
+            maxkb_logger.warning(f"⚠️ Embedding task for document {document_id} was interrupted before starting")
             return
-        maxkb_logger.info(_('Start--->Embedding document: {document_id}').format(document_id=document_id)
-        )
+
+        maxkb_logger.info(f"🚀 Starting embedding for document: {document_id}")
         # Batch-update status to STARTED
         ListenerManagement.update_status(QuerySet(Document).filter(id=document_id), TaskType.EMBEDDING,
                                          State.STARTED)

-        # Vectorize paragraph by paragraph
-        page_desc(QuerySet(Paragraph)
-                  .annotate(
+        # Get the paragraphs that need processing
+        paragraph_queryset = QuerySet(Paragraph).annotate(
             reversed_status=Reverse('status'),
-            task_type_status=Substr('reversed_status', TaskType.EMBEDDING.value,
-                                    1),
-        ).filter(task_type_status__in=state_list, document_id=document_id)
-                  .values('id'), 5,
+            task_type_status=Substr('reversed_status', TaskType.EMBEDDING.value, 1),
+        ).filter(task_type_status__in=state_list, document_id=document_id).values('id')
+
+        # Count the paragraphs that need processing
+        total_paragraphs = paragraph_queryset.count()
+        maxkb_logger.info(f"📊 Found {total_paragraphs} paragraphs to embed for document {document_id}")
+
+        if total_paragraphs == 0:
+            maxkb_logger.warning(f"⚠️ No paragraphs found for embedding in document {document_id}")
+            return
+
+        # Vectorize paragraph by paragraph (5 per batch)
+        batch_size = 5
+        maxkb_logger.info(f"🔄 Processing paragraphs in batches of {batch_size}")
+        page_desc(paragraph_queryset, batch_size,
                   ListenerManagement.get_embedding_paragraph_apply(embedding_model, is_the_task_interrupted,
                                                                    ListenerManagement.get_aggregation_document_status(
                                                                        document_id)),
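The queryset above reads one character out of the paragraph `status` field with `Reverse` plus `Substr`. As a rough plain-Python sketch of what that annotation computes (assuming `status` is a string of single-character state codes addressed from its right-hand end, which this diff does not show):

def task_state(status: str, task_type_value: int) -> str:
    # Mirrors Reverse('status') followed by Substr(..., task_type_value, 1):
    # Django's Substr is 1-based, so slot 1 is the last character of the original string.
    reversed_status = status[::-1]
    return reversed_status[task_type_value - 1]

# Paragraphs whose embedding slot is in state_list are the ones picked up for (re-)embedding.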
@@ -31,29 +31,50 @@ class MediaSplitHandle(BaseSplitHandle):
         """Handle audio/video files"""

         maxkb_logger.info(f"MediaSplitHandle.handle called with file: {file.name}")
-        maxkb_logger.info(f"Split parameters - limit: {limit}, patterns: {pattern_list}, with_filter: {with_filter}")
+        maxkb_logger.info(f"📋 Split parameters - limit: {limit}, patterns: {pattern_list}, with_filter: {with_filter}")

         # Check whether actual processing is required
         use_actual_processing = kwargs.get('use_actual_processing', False)
         stt_model_id = kwargs.get('stt_model_id')
+        llm_model_id = kwargs.get('llm_model_id')
+        workspace_id = kwargs.get('workspace_id')
+
+        maxkb_logger.info(f"🎯 Processing mode: {'Actual processing' if use_actual_processing else 'Default text'}")
+        maxkb_logger.info(f"🔧 Model IDs - STT: {stt_model_id}, LLM: {llm_model_id}, Workspace: {workspace_id}")
+
         if use_actual_processing and stt_model_id:
             # Perform the actual processing
+            maxkb_logger.info(f"🎬 Starting actual media processing for {file.name}")
             result = self._handle_actual_processing(file, get_buffer, **kwargs)

+            # Check the processing result
+            paragraphs_count = len(result.get('content', []))
+            maxkb_logger.info(f"📝 Actual processing generated {paragraphs_count} paragraphs")
+
             # Apply smart chunking (if needed)
             chunk_limit = limit if limit > 0 else 1000  # default: 1000 characters
-            if len(result.get('content', [])) > 0:
+            maxkb_logger.info(f"🔧 Chunk limit set to: {chunk_limit}")
+
+            if paragraphs_count > 0:
+                maxkb_logger.info(f"✂️ Applying smart split to {paragraphs_count} paragraphs")
                 result = self._apply_smart_split(result, chunk_limit, with_filter)
+                final_chunks = len(result.get('content', []))
+                maxkb_logger.info(f"✅ Smart split completed, final chunks: {final_chunks}")
+            else:
+                maxkb_logger.warning(f"⚠️ No paragraphs generated from actual processing")

             return result
         else:
             # Use the default text
+            maxkb_logger.info(f"📄 Using default text mode for {file.name}")
             result = self._handle_default_text(file, **kwargs)

             # Apply chunking to the default text as well (when a limit is given)
             if limit > 0:
+                maxkb_logger.info(f"✂️ Applying smart split to default text with limit {limit}")
                 result = self._apply_smart_split(result, limit, with_filter)
+                final_chunks = len(result.get('content', []))
+                maxkb_logger.info(f"✅ Smart split completed, final chunks: {final_chunks}")

             return result

@@ -362,43 +383,68 @@ class MediaSplitHandle(BaseSplitHandle):
         """Apply smart chunking to the transcription result"""
         overlap = 100  # number of overlapping characters between adjacent chunks

+        maxkb_logger.info(f"🔧 Starting smart split process - limit: {limit}, with_filter: {with_filter}")
+        original_paragraphs = result.get('content', [])
+        maxkb_logger.info(f"📊 Original paragraphs count: {len(original_paragraphs)}")
+
         new_paragraphs = []
-        for paragraph in result.get('content', []):
+        total_chunks_created = 0
+
+        for idx, paragraph in enumerate(original_paragraphs):
+            maxkb_logger.info(f"🔍 Processing paragraph {idx+1}/{len(original_paragraphs)}: {paragraph.get('title', 'Untitled')}")
             content = paragraph.get('content', '')
+            content_length = len(content)
+            maxkb_logger.info(f"📏 Paragraph {idx+1} content length: {content_length} characters")
+
             # Apply text filtering (if needed)
             if with_filter:
+                maxkb_logger.info(f"🧹 Applying text filter to paragraph {idx+1}")
                 content = self._clean_text(content)
+                maxkb_logger.info(f"🧹 Filtered content length: {len(content)} characters")

             if content:
-                # Apply smart chunking
-                chunks = self.smart_split_transcription(content, limit, overlap)
+                # Decide whether chunking is needed
+                if content_length > limit:
+                    maxkb_logger.info(f"✂️ Paragraph {idx+1} needs splitting (length {content_length} > limit {limit})")
+
+                    # Apply smart chunking
+                    chunks = self.smart_split_transcription(content, limit, overlap)
+                    maxkb_logger.info(f"✂️ Split paragraph {idx+1} into {len(chunks)} chunks")
+
+                    # Log details for each chunk
+                    for c_idx, chunk in enumerate(chunks):
+                        maxkb_logger.info(f"📦 Chunk {c_idx+1}/{len(chunks)}: length={len(chunk)}, preview='{chunk[:50]}...'")

-                # If there is only one chunk and it is unchanged, keep it as-is
-                if len(chunks) == 1 and chunks[0] == content:
-                    new_paragraphs.append(paragraph)
-                else:
                     # Create the new paragraphs
-                    for idx, chunk in enumerate(chunks):
+                    for c_idx, chunk in enumerate(chunks):
                         # Keep the original metadata but update the chunking-related fields
                         metadata = paragraph.get('metadata', {}).copy()
                         metadata.update({
-                            'chunk_index': idx,
+                            'chunk_index': c_idx,
                             'total_chunks': len(chunks),
                             'split_method': 'smart_transcription',
                             'split_limit': limit,
                             'split_overlap': overlap,
-                            'with_filter': with_filter
+                            'with_filter': with_filter,
+                            'original_paragraph_index': idx,
+                            'original_content_length': content_length
                         })

                         new_paragraph = {
                             'content': chunk,
-                            'title': f"{paragraph.get('title', '段落')} - 第{idx + 1}部分" if len(chunks) > 1 else paragraph.get('title', '段落'),
+                            'title': f"{paragraph.get('title', '段落')} - 第{c_idx + 1}部分" if len(chunks) > 1 else paragraph.get('title', '段落'),
                             'metadata': metadata
                         }
                         new_paragraphs.append(new_paragraph)
+                        total_chunks_created += 1
+                else:
+                    maxkb_logger.info(f"📄 Paragraph {idx+1} does not need splitting (length {content_length} <= limit {limit})")
+                    new_paragraphs.append(paragraph)
+                    total_chunks_created += 1
             else:
+                maxkb_logger.warning(f"⚠️ Paragraph {idx+1} has empty content after processing")
                 new_paragraphs.append(paragraph)
+                total_chunks_created += 1

         # Update the result
         result['content'] = new_paragraphs
@@ -407,9 +453,16 @@ class MediaSplitHandle(BaseSplitHandle):
         metadata = result.get('metadata', {})
         metadata['smart_split_applied'] = True
         metadata['total_chunks'] = len(new_paragraphs)
+        metadata['original_paragraphs'] = len(original_paragraphs)
+        metadata['split_parameters'] = {
+            'limit': limit,
+            'overlap': overlap,
+            'with_filter': with_filter
+        }
         result['metadata'] = metadata

-        maxkb_logger.info(f"Applied smart transcription split: {len(new_paragraphs)} chunks")
+        maxkb_logger.info(f"✅ Smart split completed - original: {len(original_paragraphs)} paragraphs, final: {len(new_paragraphs)} chunks")
+        maxkb_logger.info(f"📈 Total chunks created: {total_chunks_created}")

         return result

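smart_split_transcription itself is not part of this diff. As a minimal sketch of what a limit/overlap splitter of this shape typically does (an assumption for illustration, not the project's actual implementation, which is presumably sentence-aware):

def split_with_overlap(text: str, limit: int, overlap: int) -> list:
    # Step forward by (limit - overlap) characters so each chunk repeats the
    # tail of the previous one, which is what 'split_overlap': overlap records.
    if len(text) <= limit:
        return [text]
    step = max(limit - overlap, 1)
    return [text[i:i + limit] for i in range(0, len(text), step)]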
@@ -37,11 +37,26 @@ def page_desc(query_set, page_size, handler, is_the_task_interrupted=lambda: False):
    @param is_the_task_interrupted: whether the task has been interrupted
    @return:
    """
+   from common.utils.logger import maxkb_logger
+
    query = query_set.order_by("id")
    count = query_set.count()
+
+   maxkb_logger.info(f"🔍 page_desc: Processing {count} items in batches of {page_size}")
+   maxkb_logger.info(f"📊 Total batches to process: {ceil(count / page_size)}")
+
+   batch_count = 0
    for i in sorted(range(0, ceil(count / page_size)), reverse=True):
        if is_the_task_interrupted():
+           maxkb_logger.warning(f"⚠️ page_desc: Task interrupted during batch processing")
            return
+
+       batch_count += 1
        offset = i * page_size
        paragraph_list = query.all()[offset: offset + page_size]
+
+       maxkb_logger.info(f"🔄 Processing batch {batch_count}/{ceil(count / page_size)}: {len(paragraph_list)} items")
        handler(paragraph_list)
+       maxkb_logger.info(f"✅ Completed batch {batch_count}")
+
+   maxkb_logger.info(f"🎉 page_desc: Completed all {batch_count} batches")
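One reading of the descending iteration in page_desc (not asserted by this diff): if the handler's status updates shrink the filtered queryset, ascending offsets would skip rows, while walking pages from the end keeps every unprocessed row in front of the current offset. The offset arithmetic for, say, 12 matching rows and a page size of 5:

from math import ceil

count, page_size = 12, 5                       # hypothetical numbers
for i in sorted(range(0, ceil(count / page_size)), reverse=True):
    print(i * page_size)                       # offsets 10, 5, 0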
@@ -144,16 +144,25 @@ def embedding_by_data_source(document_id, knowledge_id, workspace_id):
    @param knowledge_id: knowledge base id
    @param workspace_id: workspace id
    """
+   maxkb_logger.info(f"🔍 Starting embedding_by_data_source for document {document_id}")
+   maxkb_logger.info(f"📊 Parameters - knowledge_id: {knowledge_id}, workspace_id: {workspace_id}")
+
    try:
        from knowledge.serializers.common import get_embedding_model_id_by_knowledge_id
+       maxkb_logger.info(f"🔧 Getting embedding model ID for knowledge base {knowledge_id}")
        embedding_model_id = get_embedding_model_id_by_knowledge_id(knowledge_id)

        if embedding_model_id:
+           maxkb_logger.info(f"✅ Found embedding model: {embedding_model_id}")
+           maxkb_logger.info(f"🚀 Submitting embedding task for document {document_id}")
            embedding_by_document.delay(document_id, embedding_model_id)
-           maxkb_logger.info(f"Started embedding for document {document_id} with model {embedding_model_id}")
+           maxkb_logger.info(f"✅ Successfully submitted embedding task for document {document_id} with model {embedding_model_id}")
        else:
-           maxkb_logger.warning(f"No embedding model found for knowledge {knowledge_id}")
+           maxkb_logger.error(f"❌ No embedding model found for knowledge base {knowledge_id}")
+           maxkb_logger.error(f"⚠️ Cannot proceed with embedding for document {document_id}")
    except Exception as e:
-       maxkb_logger.error(f"Failed to start embedding for document {document_id}: {str(e)}")
+       maxkb_logger.error(f"❌ Failed to start embedding for document {document_id}: {str(e)}")
+       maxkb_logger.error(f"🔍 Full error traceback: {traceback.format_exc()}")
        raise

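Note that embedding_by_document.delay(...) only enqueues a Celery task, so the "✅ Successfully submitted" log records submission rather than completion. A minimal sketch of that distinction (standard Celery semantics, not code from this commit):

async_result = embedding_by_document.delay(document_id, embedding_model_id)
# The task id is available immediately; the embedding itself runs later on a worker.
maxkb_logger.info(f"queued embedding task {async_result.id} for document {document_id}")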
@@ -132,14 +132,22 @@ def media_learning_by_document(document_id: str, knowledge_id: str, workspace_id
    # Status stays STARTED; the stages are distinguished via logs

    # Create the paragraph objects
+   maxkb_logger.info(f"🔧 Starting paragraph creation for document {document_id}")
    with transaction.atomic():
        paragraph_models = []
        for idx, para_data in enumerate(paragraphs_data):
+           content = para_data.get('content', '')
+           title = para_data.get('title', f'段落 {idx + 1}')
+           content_length = len(content)
+
+           maxkb_logger.info(f"📝 Creating paragraph {idx+1}/{len(paragraphs_data)}: title='{title}', length={content_length}")
+           maxkb_logger.info(f"📄 Paragraph {idx+1} preview: '{content[:100]}...'")
+
            paragraph = Paragraph(
                document_id=document_id,
                knowledge_id=knowledge_id,
-               content=para_data.get('content', ''),
-               title=para_data.get('title', f'段落 {idx + 1}'),
+               content=content,
+               title=title,
                position=idx + 1,
                status_meta=para_data.get('metadata', {})
            )
@@ -147,17 +155,33 @@ def media_learning_by_document(document_id: str, knowledge_id: str, workspace_id

        # Bulk-save the paragraphs
        if paragraph_models:
+           maxkb_logger.info(f"💾 Saving {len(paragraph_models)} paragraphs to database")
            QuerySet(Paragraph).bulk_create(paragraph_models)
-           maxkb_logger.info(f"✅ Created {len(paragraph_models)} paragraphs for document {document_id}")
+           maxkb_logger.info(f"✅ Successfully created {len(paragraph_models)} paragraphs for document {document_id}")

-           # Update the document character length
+           # Statistics
            total_char_length = sum(len(p.content) for p in paragraph_models)
-           document.char_length = total_char_length
-           document.save()
+           avg_char_length = total_char_length / len(paragraph_models)
+           maxkb_logger.info(f"📊 Paragraph statistics - total chars: {total_char_length}, average: {avg_char_length:.2f}")
+
+           # Update the document character length
+           document.char_length = total_char_length
+           document.save()
+           maxkb_logger.info(f"📄 Updated document char_length to {total_char_length}")
+       else:
+           maxkb_logger.warning(f"⚠️ No paragraph models to save for document {document_id}")

    # Step 3: trigger the embedding task
-   maxkb_logger.info(f"🔍 Starting embedding for document: {document_id}")
-   embedding_by_data_source(document_id, knowledge_id, workspace_id)
+   maxkb_logger.info(f"🔍 Starting embedding process for document: {document_id}")
+   maxkb_logger.info(f"📊 Document {document_id} has {len(paragraph_models)} paragraphs to embed")
+
+   try:
+       embedding_by_data_source(document_id, knowledge_id, workspace_id)
+       maxkb_logger.info(f"✅ Embedding task successfully submitted for document {document_id}")
+   except Exception as embedding_error:
+       maxkb_logger.error(f"❌ Failed to start embedding for document {document_id}: {str(embedding_error)}")
+       maxkb_logger.error(f"🔍 Embedding error details: {traceback.format_exc()}")
+       # Note: do not raise here; keep the document status successful but log the embedding failure
+
    # Step 4: update the status to completed
    maxkb_logger.info(f"✅ Updating status to: SUCCESS (完成)")