refactor: improve logging and status updates in embedding methods

This commit is contained in:
CaptainB 2025-07-18 15:21:01 +08:00
parent c363003aed
commit 6e16c74a5e

View File

@@ -6,15 +6,16 @@
@date2023/10/20 14:01 @date2023/10/20 14:01
@desc: @desc:
""" """
import datetime
import os import os
import threading import threading
import datetime
import traceback import traceback
from typing import List from typing import List
import django.db.models import django.db.models
from django.db.models import QuerySet from django.db.models import QuerySet
from django.db.models.functions import Substr, Reverse from django.db.models.functions import Substr, Reverse
from django.utils.translation import gettext_lazy as _
from langchain_core.embeddings import Embeddings from langchain_core.embeddings import Embeddings
from common.config.embedding_config import VectorStore from common.config.embedding_config import VectorStore
@@ -23,10 +24,9 @@ from common.utils.common import get_file_content
from common.utils.lock import RedisLock from common.utils.lock import RedisLock
from common.utils.logger import maxkb_logger from common.utils.logger import maxkb_logger
from common.utils.page_utils import page_desc from common.utils.page_utils import page_desc
from knowledge.models import Paragraph, Status, Document, ProblemParagraphMapping, TaskType, State,SourceType, SearchMode from knowledge.models import Paragraph, Status, Document, ProblemParagraphMapping, TaskType, State, SourceType, \
SearchMode
from maxkb.conf import (PROJECT_DIR) from maxkb.conf import (PROJECT_DIR)
from django.utils.translation import gettext_lazy as _
lock = threading.Lock() lock = threading.Lock()
@@ -91,8 +91,9 @@ class ListenerManagement:
@staticmethod @staticmethod
def embedding_by_paragraph_data_list(data_list, paragraph_id_list, embedding_model: Embeddings): def embedding_by_paragraph_data_list(data_list, paragraph_id_list, embedding_model: Embeddings):
maxkb_logger.info(_('Start--->Embedding paragraph: {paragraph_id_list}').format(paragraph_id_list=paragraph_id_list)) maxkb_logger.info(_('Start--->Embedding paragraph: {paragraph_id_list}').format(
status = State.SUCCESS paragraph_id_list=paragraph_id_list)
)
try: try:
# 删除段落 # 删除段落
VectorStore.get_embedding_vector().delete_by_paragraph_ids(paragraph_id_list) VectorStore.get_embedding_vector().delete_by_paragraph_ids(paragraph_id_list)
@@ -102,14 +103,20 @@ class ListenerManagement:
# 批量向量化 # 批量向量化
VectorStore.get_embedding_vector().batch_save(data_list, embedding_model, is_save_function) VectorStore.get_embedding_vector().batch_save(data_list, embedding_model, is_save_function)
ListenerManagement.update_status(
QuerySet(Paragraph).filter(id__in=paragraph_id_list), TaskType.EMBEDDING, State.SUCCESS
)
except Exception as e: except Exception as e:
maxkb_logger.error(_('Vectorized paragraph: {paragraph_id_list} error {error} {traceback}').format( maxkb_logger.error(_('Vectorized paragraph: {paragraph_id_list} error {error} {traceback}').format(
paragraph_id_list=paragraph_id_list, error=str(e), traceback=traceback.format_exc())) paragraph_id_list=paragraph_id_list, error=str(e), traceback=traceback.format_exc())
status = State.FAILURE )
ListenerManagement.update_status(
QuerySet(Paragraph).filter(id__in=paragraph_id_list), TaskType.EMBEDDING, State.FAILURE
)
finally: finally:
QuerySet(Paragraph).filter(id__in=paragraph_id_list).update(**{'status': status}) maxkb_logger.info(_('End--->Embedding paragraph: {paragraph_id_list}').format(
maxkb_logger.info( paragraph_id_list=paragraph_id_list)
_('End--->Embedding paragraph: {paragraph_id_list}').format(paragraph_id_list=paragraph_id_list)) )
@staticmethod @staticmethod
def embedding_by_paragraph(paragraph_id, embedding_model: Embeddings): def embedding_by_paragraph(paragraph_id, embedding_model: Embeddings):
@@ -271,7 +278,6 @@ class ListenerManagement:
ListenerManagement.update_status(QuerySet(Document).filter(id=document_id), TaskType.EMBEDDING, ListenerManagement.update_status(QuerySet(Document).filter(id=document_id), TaskType.EMBEDDING,
State.STARTED) State.STARTED)
# 根据段落进行向量化处理 # 根据段落进行向量化处理
page_desc(QuerySet(Paragraph) page_desc(QuerySet(Paragraph)
.annotate( .annotate(
@@ -381,5 +387,6 @@ class ListenerManagement:
similarity: float, similarity: float,
search_mode: SearchMode, search_mode: SearchMode,
embedding: Embeddings): embedding: Embeddings):
return VectorStore.get_embedding_vector().hit_test(query_text, knowledge_id, exclude_document_id_list, top_number, return VectorStore.get_embedding_vector().hit_test(query_text, knowledge_id, exclude_document_id_list,
top_number,
similarity, search_mode, embedding) similarity, search_mode, embedding)