Add diagnostic logging of unthresholded similarity scores to EmbeddingSearch
Some checks are pending
sync2gitee / repo-sync (push) Waiting to run
Typos Check / Spell Check with Typos (push) Waiting to run

This commit is contained in:
朱潮 2025-08-27 01:41:47 +08:00
parent 788491db61
commit 9d65e181eb

View File

@ -212,6 +212,56 @@ class EmbeddingSearch(ISearch):
from common.utils.logger import maxkb_logger
maxkb_logger.info(f"EmbeddingSearch: Executing search with similarity threshold={similarity}, top_n={top_number}")
# First query all results without a similarity threshold to see what the actual similarity values are
test_sql = """
SELECT
paragraph_id,
comprehensive_score,
comprehensive_score as similarity
FROM
(
SELECT DISTINCT ON
("paragraph_id") ( 1 - distince ),* ,(1 - distince) AS comprehensive_score
FROM
( SELECT *, ( embedding.embedding::vector(%s) <=> %s ) AS distince FROM embedding ${embedding_query} ORDER BY distince) TEMP
ORDER BY
paragraph_id,
distince
) DISTINCT_TEMP
ORDER BY comprehensive_score DESC
LIMIT %s
"""
test_exec_sql, test_exec_params = generate_sql_by_query_dict({'embedding_query': query_set},
select_string=test_sql,
with_table_name=True)
# Run the query without the similarity threshold
test_results = select_list(test_exec_sql, [
len(query_embedding),
json.dumps(query_embedding),
*test_exec_params,
10 # 获取前10个结果
])
if len(test_results) > 0:
test_similarities = [r.get('similarity', 0) for r in test_results[:5]]
maxkb_logger.info(f"Actual similarities (no threshold): {test_similarities}")
maxkb_logger.info(f"Highest similarity: {test_similarities[0] if test_similarities else 0}, Required threshold: {similarity}")
if test_similarities[0] < similarity:
maxkb_logger.warning(f"Best similarity {test_similarities[0]} is below threshold {similarity}")
# Fetch the top paragraph's content for inspection
if len(test_results) > 0:
paragraph_id = test_results[0].get('paragraph_id')
from knowledge.models import Paragraph
para = QuerySet(Paragraph).filter(id=paragraph_id).first()
if para:
maxkb_logger.info(f"Top paragraph content preview (first 200 chars): {para.content[:200]}...")
maxkb_logger.info(f"Paragraph title: {para.title}, length: {len(para.content)}")
else:
maxkb_logger.warning("No embeddings found even without similarity threshold")
# Normal query (with the similarity threshold applied)
exec_sql, exec_params = generate_sql_by_query_dict({'embedding_query': query_set},
select_string=get_file_content(
os.path.join(PROJECT_DIR, "apps", "knowledge", 'sql',
@ -228,10 +278,10 @@ class EmbeddingSearch(ISearch):
top_number
])
maxkb_logger.info(f"EmbeddingSearch results: {len(embedding_model)} embeddings found")
maxkb_logger.info(f"EmbeddingSearch results: {len(embedding_model)} embeddings found (with threshold)")
if len(embedding_model) > 0:
similarities = [e.get('similarity', 0) for e in embedding_model[:3]]
maxkb_logger.info(f"Top 3 similarities: {similarities}")
maxkb_logger.info(f"Top 3 similarities above threshold: {similarities}")
return embedding_model