fix: update SQL queries to improve similarity calculations and indexing

This commit is contained in:
CaptainB 2025-07-24 16:18:11 +08:00
parent b3a5dc4a1c
commit 4d18b78d29
4 changed files with 29 additions and 14 deletions

View File

@ -242,7 +242,7 @@ def create_knowledge_index(knowledge_id=None, document_id=None):
if len(result) == 0: if len(result) == 0:
return return
dims = result[0]['dims'] dims = result[0]['dims']
sql = f"""CREATE INDEX "embedding_hnsw_idx_{k_id}" ON embedding USING hnsw ((embedding::vector({dims})) vector_l2_ops) WHERE knowledge_id = '{k_id}'""" sql = f"""CREATE INDEX "embedding_hnsw_idx_{k_id}" ON embedding USING hnsw ((embedding::vector({dims})) vector_cosine_ops) WHERE knowledge_id = '{k_id}'"""
update_execute(sql, []) update_execute(sql, [])
maxkb_logger.info(f'Created index for knowledge ID: {k_id}') maxkb_logger.info(f'Created index for knowledge ID: {k_id}')

View File

@ -5,15 +5,17 @@ SELECT
FROM FROM
( (
SELECT DISTINCT ON SELECT DISTINCT ON
( "paragraph_id" ) ( similarity ),* , ( "paragraph_id" ) ( 1 - distince + ts_similarity ) as similarity, *,
similarity AS comprehensive_score (1 - distince + ts_similarity) AS comprehensive_score
FROM FROM
( (
SELECT SELECT
*, *,
(( 1 - ( embedding.embedding <=> %s ) )+ts_rank_cd( embedding.search_vector, websearch_to_tsquery('simple', %s ), 32 )) AS similarity (embedding.embedding::vector(%s) <=> %s) as distince,
(ts_rank_cd( embedding.search_vector, websearch_to_tsquery('simple', %s ), 32 )) AS ts_similarity
FROM FROM
embedding ${embedding_query} embedding ${embedding_query}
ORDER BY distince
) TEMP ) TEMP
ORDER BY ORDER BY
paragraph_id, paragraph_id,

View File

@ -5,12 +5,12 @@ SELECT
FROM FROM
( (
SELECT DISTINCT ON SELECT DISTINCT ON
("paragraph_id") ( similarity ),* ,similarity AS comprehensive_score ("paragraph_id") ( 1 - distince ),* ,(1 - distince) AS comprehensive_score
FROM FROM
( SELECT *, ( 1 - ( embedding.embedding <=> %s ) ) AS similarity FROM embedding ${embedding_query}) TEMP ( SELECT *, ( embedding.embedding::vector(%s) <=> %s ) AS distince FROM embedding ${embedding_query} ORDER BY distince) TEMP
ORDER BY ORDER BY
paragraph_id, paragraph_id,
similarity DESC distince
) DISTINCT_TEMP ) DISTINCT_TEMP
WHERE comprehensive_score>%s WHERE comprehensive_score>%s
ORDER BY comprehensive_score DESC ORDER BY comprehensive_score DESC

View File

@ -172,8 +172,13 @@ class EmbeddingSearch(ISearch):
os.path.join(PROJECT_DIR, "apps", "knowledge", 'sql', os.path.join(PROJECT_DIR, "apps", "knowledge", 'sql',
'embedding_search.sql')), 'embedding_search.sql')),
with_table_name=True) with_table_name=True)
embedding_model = select_list(exec_sql, embedding_model = select_list(exec_sql, [
[json.dumps(query_embedding), *exec_params, similarity, top_number]) len(query_embedding),
json.dumps(query_embedding),
*exec_params,
similarity,
top_number
])
return embedding_model return embedding_model
def support(self, search_mode: SearchMode): def support(self, search_mode: SearchMode):
@ -193,8 +198,12 @@ class KeywordsSearch(ISearch):
os.path.join(PROJECT_DIR, "apps", "knowledge", 'sql', os.path.join(PROJECT_DIR, "apps", "knowledge", 'sql',
'keywords_search.sql')), 'keywords_search.sql')),
with_table_name=True) with_table_name=True)
embedding_model = select_list(exec_sql, embedding_model = select_list(exec_sql, [
[to_query(query_text), *exec_params, similarity, top_number]) to_query(query_text),
*exec_params,
similarity,
top_number
])
return embedding_model return embedding_model
def support(self, search_mode: SearchMode): def support(self, search_mode: SearchMode):
@ -214,9 +223,13 @@ class BlendSearch(ISearch):
os.path.join(PROJECT_DIR, "apps", "knowledge", 'sql', os.path.join(PROJECT_DIR, "apps", "knowledge", 'sql',
'blend_search.sql')), 'blend_search.sql')),
with_table_name=True) with_table_name=True)
embedding_model = select_list(exec_sql, embedding_model = select_list(exec_sql, [
[json.dumps(query_embedding), to_query(query_text), *exec_params, similarity, len(query_embedding),
top_number]) json.dumps(query_embedding),
to_query(query_text),
*exec_params, similarity,
top_number
])
return embedding_model return embedding_model
def support(self, search_mode: SearchMode): def support(self, search_mode: SearchMode):