add log

commit 788491db61 (parent 4930ef71f7)
@@ -65,14 +65,23 @@ class BaseSearchKnowledgeNode(ISearchKnowledgeStepNode):

    def execute(self, knowledge_id_list, knowledge_setting, question, show_knowledge,
                exclude_paragraph_id_list=None,
                **kwargs) -> NodeResult:
        from common.utils.logger import maxkb_logger
        self.context['question'] = question
        self.context['show_knowledge'] = show_knowledge

        maxkb_logger.info(f"SearchKnowledgeNode: Starting search for question: '{question[:100]}...'")
        maxkb_logger.info(f"SearchKnowledgeNode: Knowledge IDs: {knowledge_id_list}")
        maxkb_logger.info(f"SearchKnowledgeNode: Settings - top_n={knowledge_setting.get('top_n')}, "
                          f"similarity={knowledge_setting.get('similarity')}, "
                          f"search_mode={knowledge_setting.get('search_mode')}")

        get_knowledge_list_of_authorized = DatabaseModelManage.get_model('get_knowledge_list_of_authorized')
        chat_user_type = self.workflow_manage.get_body().get('chat_user_type')
        if get_knowledge_list_of_authorized is not None and RoleConstants.CHAT_USER.value.name == chat_user_type:
            knowledge_id_list = get_knowledge_list_of_authorized(self.workflow_manage.get_body().get('chat_user_id'),
                                                                 knowledge_id_list)
            if len(knowledge_id_list) == 0:
                maxkb_logger.warning("SearchKnowledgeNode: No authorized knowledge bases")
                return get_none_result(question)
        model_id = get_embedding_id(knowledge_id_list)
        workspace_id = self.workflow_manage.get_body().get('workspace_id')
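Note that `maxkb_logger` is imported inside `execute` rather than at module scope, a common way to avoid circular imports between workflow nodes and the shared utilities. For readers following along without the repo, a minimal stand-in for `common/utils/logger.py` might look like the sketch below (an assumption for illustration; MaxKB's actual logger configuration may differ):

    # Hypothetical minimal common/utils/logger.py -- not the project's actual file.
    import logging

    maxkb_logger = logging.getLogger("maxkb")
    if not maxkb_logger.handlers:
        _handler = logging.StreamHandler()
        _handler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s %(name)s: %(message)s"))
        maxkb_logger.addHandler(_handler)
        maxkb_logger.setLevel(logging.INFO)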
@@ -83,17 +92,33 @@ class BaseSearchKnowledgeNode(ISearchKnowledgeStepNode):

                                    QuerySet(Document).filter(
                                        knowledge_id__in=knowledge_id_list,
                                        is_active=False)]

        maxkb_logger.info(f"SearchKnowledgeNode: Excluded {len(exclude_document_id_list)} inactive documents")

        embedding_list = vector.query(question, embedding_value, knowledge_id_list, exclude_document_id_list,
                                      exclude_paragraph_id_list, True, knowledge_setting.get('top_n'),
                                      knowledge_setting.get('similarity'),
                                      SearchMode(knowledge_setting.get('search_mode')))
        # Manually close the database connection
        connection.close()

        maxkb_logger.info(f"SearchKnowledgeNode: Query returned {len(embedding_list) if embedding_list else 0} embeddings")

        if embedding_list is None:
            maxkb_logger.warning("SearchKnowledgeNode: No embeddings found")
            return get_none_result(question)
        paragraph_list = self.list_paragraph(embedding_list, vector)

        maxkb_logger.info(f"SearchKnowledgeNode: Found {len(paragraph_list)} paragraphs")

        result = [self.reset_paragraph(paragraph, embedding_list) for paragraph in paragraph_list]
        result = sorted(result, key=lambda p: p.get('similarity'), reverse=True)

        if len(result) > 0:
            maxkb_logger.info(f"SearchKnowledgeNode: Top result similarity: {result[0].get('similarity')}, "
                              f"content preview: '{result[0].get('content', '')[:100]}...'")
        else:
            maxkb_logger.warning("SearchKnowledgeNode: No results after processing")
        return NodeResult({'paragraph_list': result,
                           'is_hit_handling_method_list': [row for row in result if row.get('is_hit_handling_method')],
                           'data': '\n'.join(
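One caveat with the sort above: `key=lambda p: p.get('similarity')` raises a `TypeError` if any row lacks a `similarity` value, since `None` does not compare with `float`. A defensive variant, assuming the rows are plain dicts as in this hunk (an illustrative sketch, not the committed code):

    # None-safe descending sort: rows without a similarity score sink to the end.
    result = sorted(result,
                    key=lambda p: p.get('similarity') if p.get('similarity') is not None else float('-inf'),
                    reverse=True)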
@@ -320,12 +320,16 @@ class ParagraphSerializers(serializers.Serializer):

    @staticmethod
    def get_paragraph_problem_model(knowledge_id: str, document_id: str, instance: Dict):
        from knowledge.models import get_default_status
        paragraph = Paragraph(
            id=uuid.uuid7(),
            document_id=document_id,
            content=instance.get("content"),
            knowledge_id=knowledge_id,
            title=instance.get("title") if 'title' in instance else '',
            status=get_default_status(),
            is_active=True,
            hit_num=0
        )
        problem_paragraph_object_list = [ProblemParagraphObject(
            knowledge_id, document_id, str(paragraph.id), problem.get('content')
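A side note on the `title` expression: `instance.get("title") if 'title' in instance else ''` still yields `None` when the key is present with a `None` value. If the intent is "always a string", the usual idioms are (sketch, not the committed code):

    # Same result when 'title' is absent, and also coerces an explicit None to ''.
    title = instance.get("title") or ''

    # Or: default only when the key is missing (an explicit None would pass through).
    title = instance.get("title", '')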
@@ -62,7 +62,17 @@ class PGVector(BaseVectorStore):

        return True

    def _batch_save(self, text_list: List[Dict], embedding: Embeddings, is_the_task_interrupted):
        from common.utils.logger import maxkb_logger
        texts = [row.get('text') for row in text_list]
        maxkb_logger.info(f"PGVector batch_save: Processing {len(texts)} texts")

        # Log details of first few items for debugging
        for i, item in enumerate(text_list[:3]):
            maxkb_logger.debug(f"Item {i}: document_id={item.get('document_id')}, "
                               f"paragraph_id={item.get('paragraph_id')}, "
                               f"is_active={item.get('is_active', True)}, "
                               f"text_preview='{item.get('text', '')[:50]}...'")

        embeddings = embedding.embed_documents(texts)
        embedding_list = [
            Embedding(

@@ -76,8 +86,17 @@ class PGVector(BaseVectorStore):

                embedding=[float(x) for x in embeddings[index]],
                search_vector=SearchVector(Value(to_ts_vector(text_list[index]['text'])))
            ) for index in range(0, len(texts))]

        maxkb_logger.info(f"PGVector batch_save: Created {len(embedding_list)} embedding objects")

        if not is_the_task_interrupted():
            if len(embedding_list) > 0:
                QuerySet(Embedding).bulk_create(embedding_list)
                maxkb_logger.info(f"PGVector batch_save: Successfully saved {len(embedding_list)} embeddings to database")
            else:
                maxkb_logger.warning("PGVector batch_save: No embeddings to save")
        else:
            maxkb_logger.warning("PGVector batch_save: Task interrupted, embeddings not saved")
        return True

    def hit_test(self, query_text, knowledge_id_list: list[str], exclude_document_id_list: list[str], top_number: int,
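For large imports, the single `bulk_create` above can produce a very large INSERT, and the interrupt flag is only consulted once. Django's `bulk_create` accepts a `batch_size` argument, so a chunked variant that rechecks the flag between batches could look like this (a sketch reusing the names from the hunk above, not the committed code):

    # Hypothetical chunked save: recheck the interrupt flag between batches.
    BATCH_SIZE = 500

    saved = 0
    for start in range(0, len(embedding_list), BATCH_SIZE):
        if is_the_task_interrupted():
            maxkb_logger.warning(f"PGVector batch_save: interrupted after {saved} rows")
            break
        batch = embedding_list[start:start + BATCH_SIZE]
        QuerySet(Embedding).bulk_create(batch, batch_size=BATCH_SIZE)
        saved += len(batch)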
@@ -100,18 +119,41 @@ class PGVector(BaseVectorStore):

              exclude_document_id_list: list[str],
              exclude_paragraph_list: list[str], is_active: bool, top_n: int, similarity: float,
              search_mode: SearchMode):
        from common.utils.logger import maxkb_logger
        exclude_dict = {}
        if knowledge_id_list is None or len(knowledge_id_list) == 0:
            maxkb_logger.warning("Vector query: knowledge_id_list is empty")
            return []

        maxkb_logger.info(f"Vector query starting: query_text='{query_text[:50]}...', knowledge_ids={knowledge_id_list}, "
                          f"is_active={is_active}, top_n={top_n}, similarity={similarity}, search_mode={search_mode.value}")

        query_set = QuerySet(Embedding).filter(knowledge_id__in=knowledge_id_list, is_active=is_active)
        initial_count = query_set.count()
        maxkb_logger.info(f"Initial embedding count: {initial_count}")

        if exclude_document_id_list is not None and len(exclude_document_id_list) > 0:
            query_set = query_set.exclude(document_id__in=exclude_document_id_list)
            maxkb_logger.info(f"After excluding documents: {query_set.count()} embeddings")
        if exclude_paragraph_list is not None and len(exclude_paragraph_list) > 0:
            query_set = query_set.exclude(paragraph_id__in=exclude_paragraph_list)
            maxkb_logger.info(f"After excluding paragraphs: {query_set.count()} embeddings")
        query_set = query_set.exclude(**exclude_dict)

        final_count = query_set.count()
        maxkb_logger.info(f"Final embedding count before search: {final_count}")

        for search_handle in search_handle_list:
            if search_handle.support(search_mode):
                maxkb_logger.info(f"Using search handler: {search_handle.__class__.__name__}")
                results = search_handle.handle(query_set, query_text, query_embedding, top_n, similarity, search_mode)
                maxkb_logger.info(f"Search results: {len(results)} items found")
                if len(results) > 0:
                    maxkb_logger.info(f"Top result similarity: {results[0].get('similarity', 'N/A')}")
                return results

        maxkb_logger.warning("No suitable search handler found")
        return []

    def update_by_source_id(self, source_id: str, instance: Dict):
        QuerySet(Embedding).filter(source_id=source_id).update(**instance)
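Each `query_set.count()` call above issues its own `SELECT COUNT(*)` against PostgreSQL, so this hunk adds up to four extra queries per search. A cheap way to keep the diagnostics without paying for them in production is to gate the counts on the logger's level (illustrative sketch):

    import logging

    # Only pay for the COUNT(*) round-trips when debug logging is enabled.
    if maxkb_logger.isEnabledFor(logging.DEBUG):
        maxkb_logger.debug(f"Initial embedding count: {query_set.count()}")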
@@ -167,11 +209,17 @@ class EmbeddingSearch(ISearch):

               top_number: int,
               similarity: float,
               search_mode: SearchMode):
        from common.utils.logger import maxkb_logger
        maxkb_logger.info(f"EmbeddingSearch: Executing search with similarity threshold={similarity}, top_n={top_number}")

        exec_sql, exec_params = generate_sql_by_query_dict({'embedding_query': query_set},
                                                           select_string=get_file_content(
                                                               os.path.join(PROJECT_DIR, "apps", "knowledge", 'sql',
                                                                            'embedding_search.sql')),
                                                           with_table_name=True)

        maxkb_logger.debug(f"EmbeddingSearch SQL params count: {len(exec_params)}")

        embedding_model = select_list(exec_sql, [
            len(query_embedding),
            json.dumps(query_embedding),

@@ -179,6 +227,12 @@ class EmbeddingSearch(ISearch):

            similarity,
            top_number
        ])

        maxkb_logger.info(f"EmbeddingSearch results: {len(embedding_model)} embeddings found")
        if len(embedding_model) > 0:
            similarities = [e.get('similarity', 0) for e in embedding_model[:3]]
            maxkb_logger.info(f"Top 3 similarities: {similarities}")

        return embedding_model

    def support(self, search_mode: SearchMode):
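One general note on the logging style used throughout this commit: f-strings are formatted before the logger ever checks its level, so the interpolation cost is paid even when the message is discarded. The stdlib's %-style arguments defer formatting until the record is actually emitted (sketch):

    # Lazy %-style formatting: the string is only built if INFO is enabled.
    maxkb_logger.info("EmbeddingSearch results: %s embeddings found", len(embedding_model))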
@@ -1,4 +1,4 @@

#!/bin/bash

# Docker image rebuild script
# Builds the MaxKB Docker image (includes MinerU integration and local LibreOffice)