From 788491db6135de86ec8f95f324266c36f06c4a2d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9C=B1=E6=BD=AE?= Date: Wed, 27 Aug 2025 01:23:26 +0800 Subject: [PATCH] add log --- .../impl/base_search_knowledge_node.py | 25 ++++++++ apps/knowledge/serializers/paragraph.py | 6 +- apps/knowledge/vector/pg_vector.py | 58 ++++++++++++++++++- rebuild-docker.sh | 2 +- 4 files changed, 87 insertions(+), 4 deletions(-) diff --git a/apps/application/flow/step_node/search_knowledge_node/impl/base_search_knowledge_node.py b/apps/application/flow/step_node/search_knowledge_node/impl/base_search_knowledge_node.py index ddbe6759..6a1969c0 100644 --- a/apps/application/flow/step_node/search_knowledge_node/impl/base_search_knowledge_node.py +++ b/apps/application/flow/step_node/search_knowledge_node/impl/base_search_knowledge_node.py @@ -65,14 +65,23 @@ class BaseSearchKnowledgeNode(ISearchKnowledgeStepNode): def execute(self, knowledge_id_list, knowledge_setting, question, show_knowledge, exclude_paragraph_id_list=None, **kwargs) -> NodeResult: + from common.utils.logger import maxkb_logger self.context['question'] = question self.context['show_knowledge'] = show_knowledge + + maxkb_logger.info(f"SearchKnowledgeNode: Starting search for question: '{question[:100]}...'") + maxkb_logger.info(f"SearchKnowledgeNode: Knowledge IDs: {knowledge_id_list}") + maxkb_logger.info(f"SearchKnowledgeNode: Settings - top_n={knowledge_setting.get('top_n')}, " + f"similarity={knowledge_setting.get('similarity')}, " + f"search_mode={knowledge_setting.get('search_mode')}") + get_knowledge_list_of_authorized = DatabaseModelManage.get_model('get_knowledge_list_of_authorized') chat_user_type = self.workflow_manage.get_body().get('chat_user_type') if get_knowledge_list_of_authorized is not None and RoleConstants.CHAT_USER.value.name == chat_user_type: knowledge_id_list = get_knowledge_list_of_authorized(self.workflow_manage.get_body().get('chat_user_id'), knowledge_id_list) if len(knowledge_id_list) == 0: + 
maxkb_logger.warning("SearchKnowledgeNode: No authorized knowledge bases") return get_none_result(question) model_id = get_embedding_id(knowledge_id_list) workspace_id = self.workflow_manage.get_body().get('workspace_id') @@ -83,17 +92,33 @@ class BaseSearchKnowledgeNode(ISearchKnowledgeStepNode): QuerySet(Document).filter( knowledge_id__in=knowledge_id_list, is_active=False)] + + maxkb_logger.info(f"SearchKnowledgeNode: Excluded {len(exclude_document_id_list)} inactive documents") + embedding_list = vector.query(question, embedding_value, knowledge_id_list, exclude_document_id_list, exclude_paragraph_id_list, True, knowledge_setting.get('top_n'), knowledge_setting.get('similarity'), SearchMode(knowledge_setting.get('search_mode'))) # 手动关闭数据库连接 connection.close() + + maxkb_logger.info(f"SearchKnowledgeNode: Query returned {len(embedding_list) if embedding_list else 0} embeddings") + if embedding_list is None: + maxkb_logger.warning("SearchKnowledgeNode: No embeddings found") return get_none_result(question) paragraph_list = self.list_paragraph(embedding_list, vector) + + maxkb_logger.info(f"SearchKnowledgeNode: Found {len(paragraph_list)} paragraphs") + result = [self.reset_paragraph(paragraph, embedding_list) for paragraph in paragraph_list] result = sorted(result, key=lambda p: p.get('similarity'), reverse=True) + + if len(result) > 0: + maxkb_logger.info(f"SearchKnowledgeNode: Top result similarity: {result[0].get('similarity')}, " + f"content preview: '{result[0].get('content', '')[:100]}...'") + else: + maxkb_logger.warning("SearchKnowledgeNode: No results after processing") return NodeResult({'paragraph_list': result, 'is_hit_handling_method_list': [row for row in result if row.get('is_hit_handling_method')], 'data': '\n'.join( diff --git a/apps/knowledge/serializers/paragraph.py b/apps/knowledge/serializers/paragraph.py index 3e06dafb..426a36a9 100644 --- a/apps/knowledge/serializers/paragraph.py +++ b/apps/knowledge/serializers/paragraph.py @@ -320,12 
+320,16 @@ class ParagraphSerializers(serializers.Serializer): @staticmethod def get_paragraph_problem_model(knowledge_id: str, document_id: str, instance: Dict): + from knowledge.models import get_default_status paragraph = Paragraph( id=uuid.uuid7(), document_id=document_id, content=instance.get("content"), knowledge_id=knowledge_id, - title=instance.get("title") if 'title' in instance else '' + title=instance.get("title") if 'title' in instance else '', + status=get_default_status(), + is_active=True, + hit_num=0 ) problem_paragraph_object_list = [ProblemParagraphObject( knowledge_id, document_id, str(paragraph.id), problem.get('content') diff --git a/apps/knowledge/vector/pg_vector.py b/apps/knowledge/vector/pg_vector.py index f787cd83..dcbe2757 100644 --- a/apps/knowledge/vector/pg_vector.py +++ b/apps/knowledge/vector/pg_vector.py @@ -62,7 +62,17 @@ class PGVector(BaseVectorStore): return True def _batch_save(self, text_list: List[Dict], embedding: Embeddings, is_the_task_interrupted): + from common.utils.logger import maxkb_logger texts = [row.get('text') for row in text_list] + maxkb_logger.info(f"PGVector batch_save: Processing {len(texts)} texts") + + # Log details of first few items for debugging + for i, item in enumerate(text_list[:3]): + maxkb_logger.debug(f"Item {i}: document_id={item.get('document_id')}, " + f"paragraph_id={item.get('paragraph_id')}, " + f"is_active={item.get('is_active', True)}, " + f"text_preview='{item.get('text', '')[:50]}...'") + embeddings = embedding.embed_documents(texts) embedding_list = [ Embedding( @@ -76,8 +86,17 @@ class PGVector(BaseVectorStore): embedding=[float(x) for x in embeddings[index]], search_vector=SearchVector(Value(to_ts_vector(text_list[index]['text']))) ) for index in range(0, len(texts))] + + maxkb_logger.info(f"PGVector batch_save: Created {len(embedding_list)} embedding objects") + if not is_the_task_interrupted(): - QuerySet(Embedding).bulk_create(embedding_list) if len(embedding_list) > 0 else None + 
if len(embedding_list) > 0: + QuerySet(Embedding).bulk_create(embedding_list) + maxkb_logger.info(f"PGVector batch_save: Successfully saved {len(embedding_list)} embeddings to database") + else: + maxkb_logger.warning("PGVector batch_save: No embeddings to save") + else: + maxkb_logger.warning("PGVector batch_save: Task interrupted, embeddings not saved") return True def hit_test(self, query_text, knowledge_id_list: list[str], exclude_document_id_list: list[str], top_number: int, @@ -100,18 +119,41 @@ class PGVector(BaseVectorStore): exclude_document_id_list: list[str], exclude_paragraph_list: list[str], is_active: bool, top_n: int, similarity: float, search_mode: SearchMode): + from common.utils.logger import maxkb_logger exclude_dict = {} if knowledge_id_list is None or len(knowledge_id_list) == 0: + maxkb_logger.warning("Vector query: knowledge_id_list is empty") return [] + + maxkb_logger.info(f"Vector query starting: query_text='{query_text[:50]}...', knowledge_ids={knowledge_id_list}, " + f"is_active={is_active}, top_n={top_n}, similarity={similarity}, search_mode={search_mode.value}") + query_set = QuerySet(Embedding).filter(knowledge_id__in=knowledge_id_list, is_active=is_active) + initial_count = query_set.count() + maxkb_logger.info(f"Initial embedding count: {initial_count}") + if exclude_document_id_list is not None and len(exclude_document_id_list) > 0: query_set = query_set.exclude(document_id__in=exclude_document_id_list) + maxkb_logger.info(f"After excluding documents: {query_set.count()} embeddings") if exclude_paragraph_list is not None and len(exclude_paragraph_list) > 0: query_set = query_set.exclude(paragraph_id__in=exclude_paragraph_list) + maxkb_logger.info(f"After excluding paragraphs: {query_set.count()} embeddings") query_set = query_set.exclude(**exclude_dict) + + final_count = query_set.count() + maxkb_logger.info(f"Final embedding count before search: {final_count}") + for search_handle in search_handle_list: if 
search_handle.support(search_mode): - return search_handle.handle(query_set, query_text, query_embedding, top_n, similarity, search_mode) + maxkb_logger.info(f"Using search handler: {search_handle.__class__.__name__}") + results = search_handle.handle(query_set, query_text, query_embedding, top_n, similarity, search_mode) + maxkb_logger.info(f"Search results: {len(results)} items found") + if len(results) > 0: + maxkb_logger.info(f"Top result similarity: {results[0].get('similarity', 'N/A')}") + return results + + maxkb_logger.warning("No suitable search handler found") + return [] def update_by_source_id(self, source_id: str, instance: Dict): QuerySet(Embedding).filter(source_id=source_id).update(**instance) @@ -167,11 +209,17 @@ class EmbeddingSearch(ISearch): top_number: int, similarity: float, search_mode: SearchMode): + from common.utils.logger import maxkb_logger + maxkb_logger.info(f"EmbeddingSearch: Executing search with similarity threshold={similarity}, top_n={top_number}") + exec_sql, exec_params = generate_sql_by_query_dict({'embedding_query': query_set}, select_string=get_file_content( os.path.join(PROJECT_DIR, "apps", "knowledge", 'sql', 'embedding_search.sql')), with_table_name=True) + + maxkb_logger.debug(f"EmbeddingSearch SQL params count: {len(exec_params)}") + embedding_model = select_list(exec_sql, [ len(query_embedding), json.dumps(query_embedding), @@ -179,6 +227,12 @@ class EmbeddingSearch(ISearch): similarity, top_number ]) + + maxkb_logger.info(f"EmbeddingSearch results: {len(embedding_model)} embeddings found") + if len(embedding_model) > 0: + similarities = [e.get('similarity', 0) for e in embedding_model[:3]] + maxkb_logger.info(f"Top 3 similarities: {similarities}") + return embedding_model def support(self, search_mode: SearchMode): diff --git a/rebuild-docker.sh b/rebuild-docker.sh index 4f788ad1..b73e71ad 100755 --- a/rebuild-docker.sh +++ b/rebuild-docker.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/bin/bash # Docker 镜像重新编译脚本 # 用于构建 MaxKB 
Docker 镜像(包含 MinerU 集成和本地 LibreOffice)