refactor: improve logging and status updates in embedding methods

This commit is contained in:
CaptainB 2025-07-18 15:21:01 +08:00
parent c363003aed
commit 6e16c74a5e

View File

@ -6,15 +6,16 @@
@date: 2023/10/20 14:01
@desc:
"""
import datetime
import os
import threading
import datetime
import traceback
from typing import List
import django.db.models
from django.db.models import QuerySet
from django.db.models.functions import Substr, Reverse
from django.utils.translation import gettext_lazy as _
from langchain_core.embeddings import Embeddings
from common.config.embedding_config import VectorStore
@ -23,10 +24,9 @@ from common.utils.common import get_file_content
from common.utils.lock import RedisLock
from common.utils.logger import maxkb_logger
from common.utils.page_utils import page_desc
from knowledge.models import Paragraph, Status, Document, ProblemParagraphMapping, TaskType, State,SourceType, SearchMode
from knowledge.models import Paragraph, Status, Document, ProblemParagraphMapping, TaskType, State, SourceType, \
SearchMode
from maxkb.conf import (PROJECT_DIR)
from django.utils.translation import gettext_lazy as _
lock = threading.Lock()
@ -91,8 +91,9 @@ class ListenerManagement:
@staticmethod
def embedding_by_paragraph_data_list(data_list, paragraph_id_list, embedding_model: Embeddings):
maxkb_logger.info(_('Start--->Embedding paragraph: {paragraph_id_list}').format(paragraph_id_list=paragraph_id_list))
status = State.SUCCESS
maxkb_logger.info(_('Start--->Embedding paragraph: {paragraph_id_list}').format(
paragraph_id_list=paragraph_id_list)
)
try:
# 删除段落
VectorStore.get_embedding_vector().delete_by_paragraph_ids(paragraph_id_list)
@ -102,14 +103,20 @@ class ListenerManagement:
# 批量向量化
VectorStore.get_embedding_vector().batch_save(data_list, embedding_model, is_save_function)
ListenerManagement.update_status(
QuerySet(Paragraph).filter(id__in=paragraph_id_list), TaskType.EMBEDDING, State.SUCCESS
)
except Exception as e:
maxkb_logger.error(_('Vectorized paragraph: {paragraph_id_list} error {error} {traceback}').format(
paragraph_id_list=paragraph_id_list, error=str(e), traceback=traceback.format_exc()))
status = State.FAILURE
paragraph_id_list=paragraph_id_list, error=str(e), traceback=traceback.format_exc())
)
ListenerManagement.update_status(
QuerySet(Paragraph).filter(id__in=paragraph_id_list), TaskType.EMBEDDING, State.FAILURE
)
finally:
QuerySet(Paragraph).filter(id__in=paragraph_id_list).update(**{'status': status})
maxkb_logger.info(
_('End--->Embedding paragraph: {paragraph_id_list}').format(paragraph_id_list=paragraph_id_list))
maxkb_logger.info(_('End--->Embedding paragraph: {paragraph_id_list}').format(
paragraph_id_list=paragraph_id_list)
)
@staticmethod
def embedding_by_paragraph(paragraph_id, embedding_model: Embeddings):
@ -271,7 +278,6 @@ class ListenerManagement:
ListenerManagement.update_status(QuerySet(Document).filter(id=document_id), TaskType.EMBEDDING,
State.STARTED)
# 根据段落进行向量化处理
page_desc(QuerySet(Paragraph)
.annotate(
@ -381,5 +387,6 @@ class ListenerManagement:
similarity: float,
search_mode: SearchMode,
embedding: Embeddings):
return VectorStore.get_embedding_vector().hit_test(query_text, knowledge_id, exclude_document_id_list, top_number,
return VectorStore.get_embedding_vector().hit_test(query_text, knowledge_id, exclude_document_id_list,
top_number,
similarity, search_mode, embedding)