From 9d65e181eb048d3eed2f690b9819b8ef5cff37e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9C=B1=E6=BD=AE?= Date: Wed, 27 Aug 2025 01:41:47 +0800 Subject: [PATCH] add log --- apps/knowledge/vector/pg_vector.py | 54 ++++++++++++++++++++++++++++-- 1 file changed, 52 insertions(+), 2 deletions(-) diff --git a/apps/knowledge/vector/pg_vector.py b/apps/knowledge/vector/pg_vector.py index dcbe2757..0d19689a 100644 --- a/apps/knowledge/vector/pg_vector.py +++ b/apps/knowledge/vector/pg_vector.py @@ -212,6 +212,56 @@ class EmbeddingSearch(ISearch): from common.utils.logger import maxkb_logger maxkb_logger.info(f"EmbeddingSearch: Executing search with similarity threshold={similarity}, top_n={top_number}") + # 先查询所有结果不设置相似度阈值,看看实际的相似度是多少 + test_sql = """ + SELECT + paragraph_id, + comprehensive_score, + comprehensive_score as similarity + FROM + ( + SELECT DISTINCT ON + ("paragraph_id") ( 1 - distince ),* ,(1 - distince) AS comprehensive_score + FROM + ( SELECT *, ( embedding.embedding::vector(%s) <=> %s ) AS distince FROM embedding ${embedding_query} ORDER BY distince) TEMP + ORDER BY + paragraph_id, + distince + ) DISTINCT_TEMP + ORDER BY comprehensive_score DESC + LIMIT %s + """ + + test_exec_sql, test_exec_params = generate_sql_by_query_dict({'embedding_query': query_set}, + select_string=test_sql, + with_table_name=True) + + # 查询不带阈值的结果 + test_results = select_list(test_exec_sql, [ + len(query_embedding), + json.dumps(query_embedding), + *test_exec_params, + 10 # 获取前10个结果 + ]) + + if len(test_results) > 0: + test_similarities = [r.get('similarity', 0) for r in test_results[:5]] + maxkb_logger.info(f"Actual similarities (no threshold): {test_similarities}") + maxkb_logger.info(f"Highest similarity: {test_similarities[0] if test_similarities else 0}, Required threshold: {similarity}") + if test_similarities[0] < similarity: + maxkb_logger.warning(f"Best similarity {test_similarities[0]} is below threshold {similarity}") + # 获取段落内容看看 + if len(test_results) > 0: + paragraph_id = test_results[0].get('paragraph_id') + from knowledge.models import Paragraph + para = QuerySet(Paragraph).filter(id=paragraph_id).first() + if para: + maxkb_logger.info(f"Top paragraph content preview (first 200 chars): {para.content[:200]}...") + maxkb_logger.info(f"Paragraph title: {para.title}, length: {len(para.content)}") + else: + maxkb_logger.warning("No embeddings found even without similarity threshold") + + # 正常查询(带相似度阈值) exec_sql, exec_params = generate_sql_by_query_dict({'embedding_query': query_set}, select_string=get_file_content( os.path.join(PROJECT_DIR, "apps", "knowledge", 'sql', @@ -228,10 +278,10 @@ class EmbeddingSearch(ISearch): top_number ]) - maxkb_logger.info(f"EmbeddingSearch results: {len(embedding_model)} embeddings found") + maxkb_logger.info(f"EmbeddingSearch results: {len(embedding_model)} embeddings found (with threshold)") if len(embedding_model) > 0: similarities = [e.get('similarity', 0) for e in embedding_model[:3]] - maxkb_logger.info(f"Top 3 similarities: {similarities}") + maxkb_logger.info(f"Top 3 similarities above threshold: {similarities}") return embedding_model