add log
Some checks are pending
sync2gitee / repo-sync (push) Waiting to run
Typos Check / Spell Check with Typos (push) Waiting to run

This commit is contained in:
朱潮 2025-08-27 01:23:26 +08:00
parent 4930ef71f7
commit 788491db61
4 changed files with 87 additions and 4 deletions

View File

@@ -65,14 +65,23 @@ class BaseSearchKnowledgeNode(ISearchKnowledgeStepNode):
def execute(self, knowledge_id_list, knowledge_setting, question, show_knowledge, def execute(self, knowledge_id_list, knowledge_setting, question, show_knowledge,
exclude_paragraph_id_list=None, exclude_paragraph_id_list=None,
**kwargs) -> NodeResult: **kwargs) -> NodeResult:
from common.utils.logger import maxkb_logger
self.context['question'] = question self.context['question'] = question
self.context['show_knowledge'] = show_knowledge self.context['show_knowledge'] = show_knowledge
maxkb_logger.info(f"SearchKnowledgeNode: Starting search for question: '{question[:100]}...'")
maxkb_logger.info(f"SearchKnowledgeNode: Knowledge IDs: {knowledge_id_list}")
maxkb_logger.info(f"SearchKnowledgeNode: Settings - top_n={knowledge_setting.get('top_n')}, "
f"similarity={knowledge_setting.get('similarity')}, "
f"search_mode={knowledge_setting.get('search_mode')}")
get_knowledge_list_of_authorized = DatabaseModelManage.get_model('get_knowledge_list_of_authorized') get_knowledge_list_of_authorized = DatabaseModelManage.get_model('get_knowledge_list_of_authorized')
chat_user_type = self.workflow_manage.get_body().get('chat_user_type') chat_user_type = self.workflow_manage.get_body().get('chat_user_type')
if get_knowledge_list_of_authorized is not None and RoleConstants.CHAT_USER.value.name == chat_user_type: if get_knowledge_list_of_authorized is not None and RoleConstants.CHAT_USER.value.name == chat_user_type:
knowledge_id_list = get_knowledge_list_of_authorized(self.workflow_manage.get_body().get('chat_user_id'), knowledge_id_list = get_knowledge_list_of_authorized(self.workflow_manage.get_body().get('chat_user_id'),
knowledge_id_list) knowledge_id_list)
if len(knowledge_id_list) == 0: if len(knowledge_id_list) == 0:
maxkb_logger.warning("SearchKnowledgeNode: No authorized knowledge bases")
return get_none_result(question) return get_none_result(question)
model_id = get_embedding_id(knowledge_id_list) model_id = get_embedding_id(knowledge_id_list)
workspace_id = self.workflow_manage.get_body().get('workspace_id') workspace_id = self.workflow_manage.get_body().get('workspace_id')
@@ -83,17 +92,33 @@ class BaseSearchKnowledgeNode(ISearchKnowledgeStepNode):
QuerySet(Document).filter( QuerySet(Document).filter(
knowledge_id__in=knowledge_id_list, knowledge_id__in=knowledge_id_list,
is_active=False)] is_active=False)]
maxkb_logger.info(f"SearchKnowledgeNode: Excluded {len(exclude_document_id_list)} inactive documents")
embedding_list = vector.query(question, embedding_value, knowledge_id_list, exclude_document_id_list, embedding_list = vector.query(question, embedding_value, knowledge_id_list, exclude_document_id_list,
exclude_paragraph_id_list, True, knowledge_setting.get('top_n'), exclude_paragraph_id_list, True, knowledge_setting.get('top_n'),
knowledge_setting.get('similarity'), knowledge_setting.get('similarity'),
SearchMode(knowledge_setting.get('search_mode'))) SearchMode(knowledge_setting.get('search_mode')))
# 手动关闭数据库连接 # 手动关闭数据库连接
connection.close() connection.close()
maxkb_logger.info(f"SearchKnowledgeNode: Query returned {len(embedding_list) if embedding_list else 0} embeddings")
if embedding_list is None: if embedding_list is None:
maxkb_logger.warning("SearchKnowledgeNode: No embeddings found")
return get_none_result(question) return get_none_result(question)
paragraph_list = self.list_paragraph(embedding_list, vector) paragraph_list = self.list_paragraph(embedding_list, vector)
maxkb_logger.info(f"SearchKnowledgeNode: Found {len(paragraph_list)} paragraphs")
result = [self.reset_paragraph(paragraph, embedding_list) for paragraph in paragraph_list] result = [self.reset_paragraph(paragraph, embedding_list) for paragraph in paragraph_list]
result = sorted(result, key=lambda p: p.get('similarity'), reverse=True) result = sorted(result, key=lambda p: p.get('similarity'), reverse=True)
if len(result) > 0:
maxkb_logger.info(f"SearchKnowledgeNode: Top result similarity: {result[0].get('similarity')}, "
f"content preview: '{result[0].get('content', '')[:100]}...'")
else:
maxkb_logger.warning("SearchKnowledgeNode: No results after processing")
return NodeResult({'paragraph_list': result, return NodeResult({'paragraph_list': result,
'is_hit_handling_method_list': [row for row in result if row.get('is_hit_handling_method')], 'is_hit_handling_method_list': [row for row in result if row.get('is_hit_handling_method')],
'data': '\n'.join( 'data': '\n'.join(

View File

@@ -320,12 +320,16 @@ class ParagraphSerializers(serializers.Serializer):
@staticmethod @staticmethod
def get_paragraph_problem_model(knowledge_id: str, document_id: str, instance: Dict): def get_paragraph_problem_model(knowledge_id: str, document_id: str, instance: Dict):
from knowledge.models import get_default_status
paragraph = Paragraph( paragraph = Paragraph(
id=uuid.uuid7(), id=uuid.uuid7(),
document_id=document_id, document_id=document_id,
content=instance.get("content"), content=instance.get("content"),
knowledge_id=knowledge_id, knowledge_id=knowledge_id,
title=instance.get("title") if 'title' in instance else '' title=instance.get("title") if 'title' in instance else '',
status=get_default_status(),
is_active=True,
hit_num=0
) )
problem_paragraph_object_list = [ProblemParagraphObject( problem_paragraph_object_list = [ProblemParagraphObject(
knowledge_id, document_id, str(paragraph.id), problem.get('content') knowledge_id, document_id, str(paragraph.id), problem.get('content')

View File

@@ -62,7 +62,17 @@ class PGVector(BaseVectorStore):
return True return True
def _batch_save(self, text_list: List[Dict], embedding: Embeddings, is_the_task_interrupted): def _batch_save(self, text_list: List[Dict], embedding: Embeddings, is_the_task_interrupted):
from common.utils.logger import maxkb_logger
texts = [row.get('text') for row in text_list] texts = [row.get('text') for row in text_list]
maxkb_logger.info(f"PGVector batch_save: Processing {len(texts)} texts")
# Log details of first few items for debugging
for i, item in enumerate(text_list[:3]):
maxkb_logger.debug(f"Item {i}: document_id={item.get('document_id')}, "
f"paragraph_id={item.get('paragraph_id')}, "
f"is_active={item.get('is_active', True)}, "
f"text_preview='{item.get('text', '')[:50]}...'")
embeddings = embedding.embed_documents(texts) embeddings = embedding.embed_documents(texts)
embedding_list = [ embedding_list = [
Embedding( Embedding(
@@ -76,8 +86,17 @@ class PGVector(BaseVectorStore):
embedding=[float(x) for x in embeddings[index]], embedding=[float(x) for x in embeddings[index]],
search_vector=SearchVector(Value(to_ts_vector(text_list[index]['text']))) search_vector=SearchVector(Value(to_ts_vector(text_list[index]['text'])))
) for index in range(0, len(texts))] ) for index in range(0, len(texts))]
maxkb_logger.info(f"PGVector batch_save: Created {len(embedding_list)} embedding objects")
if not is_the_task_interrupted(): if not is_the_task_interrupted():
QuerySet(Embedding).bulk_create(embedding_list) if len(embedding_list) > 0 else None if len(embedding_list) > 0:
QuerySet(Embedding).bulk_create(embedding_list)
maxkb_logger.info(f"PGVector batch_save: Successfully saved {len(embedding_list)} embeddings to database")
else:
maxkb_logger.warning("PGVector batch_save: No embeddings to save")
else:
maxkb_logger.warning("PGVector batch_save: Task interrupted, embeddings not saved")
return True return True
def hit_test(self, query_text, knowledge_id_list: list[str], exclude_document_id_list: list[str], top_number: int, def hit_test(self, query_text, knowledge_id_list: list[str], exclude_document_id_list: list[str], top_number: int,
@@ -100,18 +119,41 @@ class PGVector(BaseVectorStore):
exclude_document_id_list: list[str], exclude_document_id_list: list[str],
exclude_paragraph_list: list[str], is_active: bool, top_n: int, similarity: float, exclude_paragraph_list: list[str], is_active: bool, top_n: int, similarity: float,
search_mode: SearchMode): search_mode: SearchMode):
from common.utils.logger import maxkb_logger
exclude_dict = {} exclude_dict = {}
if knowledge_id_list is None or len(knowledge_id_list) == 0: if knowledge_id_list is None or len(knowledge_id_list) == 0:
maxkb_logger.warning("Vector query: knowledge_id_list is empty")
return [] return []
maxkb_logger.info(f"Vector query starting: query_text='{query_text[:50]}...', knowledge_ids={knowledge_id_list}, "
f"is_active={is_active}, top_n={top_n}, similarity={similarity}, search_mode={search_mode.value}")
query_set = QuerySet(Embedding).filter(knowledge_id__in=knowledge_id_list, is_active=is_active) query_set = QuerySet(Embedding).filter(knowledge_id__in=knowledge_id_list, is_active=is_active)
initial_count = query_set.count()
maxkb_logger.info(f"Initial embedding count: {initial_count}")
if exclude_document_id_list is not None and len(exclude_document_id_list) > 0: if exclude_document_id_list is not None and len(exclude_document_id_list) > 0:
query_set = query_set.exclude(document_id__in=exclude_document_id_list) query_set = query_set.exclude(document_id__in=exclude_document_id_list)
maxkb_logger.info(f"After excluding documents: {query_set.count()} embeddings")
if exclude_paragraph_list is not None and len(exclude_paragraph_list) > 0: if exclude_paragraph_list is not None and len(exclude_paragraph_list) > 0:
query_set = query_set.exclude(paragraph_id__in=exclude_paragraph_list) query_set = query_set.exclude(paragraph_id__in=exclude_paragraph_list)
maxkb_logger.info(f"After excluding paragraphs: {query_set.count()} embeddings")
query_set = query_set.exclude(**exclude_dict) query_set = query_set.exclude(**exclude_dict)
final_count = query_set.count()
maxkb_logger.info(f"Final embedding count before search: {final_count}")
for search_handle in search_handle_list: for search_handle in search_handle_list:
if search_handle.support(search_mode): if search_handle.support(search_mode):
return search_handle.handle(query_set, query_text, query_embedding, top_n, similarity, search_mode) maxkb_logger.info(f"Using search handler: {search_handle.__class__.__name__}")
results = search_handle.handle(query_set, query_text, query_embedding, top_n, similarity, search_mode)
maxkb_logger.info(f"Search results: {len(results)} items found")
if len(results) > 0:
maxkb_logger.info(f"Top result similarity: {results[0].get('similarity', 'N/A')}")
return results
maxkb_logger.warning("No suitable search handler found")
return []
def update_by_source_id(self, source_id: str, instance: Dict): def update_by_source_id(self, source_id: str, instance: Dict):
QuerySet(Embedding).filter(source_id=source_id).update(**instance) QuerySet(Embedding).filter(source_id=source_id).update(**instance)
@@ -167,11 +209,17 @@ class EmbeddingSearch(ISearch):
top_number: int, top_number: int,
similarity: float, similarity: float,
search_mode: SearchMode): search_mode: SearchMode):
from common.utils.logger import maxkb_logger
maxkb_logger.info(f"EmbeddingSearch: Executing search with similarity threshold={similarity}, top_n={top_number}")
exec_sql, exec_params = generate_sql_by_query_dict({'embedding_query': query_set}, exec_sql, exec_params = generate_sql_by_query_dict({'embedding_query': query_set},
select_string=get_file_content( select_string=get_file_content(
os.path.join(PROJECT_DIR, "apps", "knowledge", 'sql', os.path.join(PROJECT_DIR, "apps", "knowledge", 'sql',
'embedding_search.sql')), 'embedding_search.sql')),
with_table_name=True) with_table_name=True)
maxkb_logger.debug(f"EmbeddingSearch SQL params count: {len(exec_params)}")
embedding_model = select_list(exec_sql, [ embedding_model = select_list(exec_sql, [
len(query_embedding), len(query_embedding),
json.dumps(query_embedding), json.dumps(query_embedding),
@@ -179,6 +227,12 @@ class EmbeddingSearch(ISearch):
similarity, similarity,
top_number top_number
]) ])
maxkb_logger.info(f"EmbeddingSearch results: {len(embedding_model)} embeddings found")
if len(embedding_model) > 0:
similarities = [e.get('similarity', 0) for e in embedding_model[:3]]
maxkb_logger.info(f"Top 3 similarities: {similarities}")
return embedding_model return embedding_model
def support(self, search_mode: SearchMode): def support(self, search_mode: SearchMode):

View File

@@ -1,4 +1,4 @@
#!/bin/bash !/bin/bash
# Docker 镜像重新编译脚本 # Docker 镜像重新编译脚本
# 用于构建 MaxKB Docker 镜像(包含 MinerU 集成和本地 LibreOffice # 用于构建 MaxKB Docker 镜像(包含 MinerU 集成和本地 LibreOffice