feat: enhance Document API with create and query functionalities
parent ba5028858c
commit 24e734fb36
@@ -4,29 +4,7 @@ from drf_spectacular.utils import OpenApiParameter
 from common.mixins.api_mixin import APIMixin
 from common.result import DefaultResultSerializer
 from knowledge.serializers.common import BatchSerializer
-from knowledge.serializers.document import DocumentCreateRequest, DocumentInstanceSerializer
+from knowledge.serializers.document import DocumentInstanceSerializer
 
 
-class DocumentCreateAPI(APIMixin):
-    @staticmethod
-    def get_parameters():
-        return [
-            OpenApiParameter(
-                name="workspace_id",
-                description="工作空间id",
-                type=OpenApiTypes.STR,
-                location='path',
-                required=True,
-            )
-        ]
-
-    @staticmethod
-    def get_request():
-        return DocumentCreateRequest
-
-    @staticmethod
-    def get_response():
-        return DefaultResultSerializer
-
-
 class DocumentSplitAPI(APIMixin):
@@ -127,3 +105,74 @@ class DocumentBatchCreateAPI(APIMixin):
     @staticmethod
     def get_response():
         return DefaultResultSerializer
+
+
+class DocumentCreateAPI(APIMixin):
+    @staticmethod
+    def get_parameters():
+        return [
+            OpenApiParameter(
+                name="workspace_id",
+                description="工作空间id",
+                type=OpenApiTypes.STR,
+                location='path',
+                required=True,
+            ),
+            OpenApiParameter(
+                name="knowledge_id",
+                description="知识库id",
+                type=OpenApiTypes.STR,
+                location='path',
+                required=True,
+            ),
+        ]
+
+    @staticmethod
+    def get_request():
+        return DocumentInstanceSerializer
+
+    @staticmethod
+    def get_response():
+        return DefaultResultSerializer
+
+
+class DocumentReadAPI(APIMixin):
+    @staticmethod
+    def get_parameters():
+        return [
+            OpenApiParameter(
+                name="workspace_id",
+                description="工作空间id",
+                type=OpenApiTypes.STR,
+                location='path',
+                required=True,
+            ),
+            OpenApiParameter(
+                name="knowledge_id",
+                description="知识库id",
+                type=OpenApiTypes.STR,
+                location='path',
+                required=True,
+            ),
+            OpenApiParameter(
+                name="document_id",
+                description="文档id",
+                type=OpenApiTypes.STR,
+                location='path',
+                required=True,
+            ),
+        ]
+
+    @staticmethod
+    def get_response():
+        return DefaultResultSerializer
+
+
+class DocumentEditAPI(DocumentReadAPI):
+    @staticmethod
+    def get_request():
+        return DocumentInstanceSerializer
+
+
+class DocumentDeleteAPI(DocumentReadAPI):
+    pass
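The new classes follow this file's existing APIMixin pattern: static get_parameters/get_request/get_response hooks that the view layer feeds into drf-spectacular's extend_schema. DocumentEditAPI and DocumentDeleteAPI inherit DocumentReadAPI, so all three document-detail endpoints share the same path parameters. A minimal sketch of the wiring, mirroring the view changes at the end of this commit:

from drf_spectacular.utils import extend_schema

@extend_schema(
    description=_('Get document details'),
    parameters=DocumentReadAPI.get_parameters(),  # workspace_id, knowledge_id, document_id
    responses=DocumentReadAPI.get_response(),     # DefaultResultSerializer
    tags=[_('Knowledge Base/Documentation')]
)
def get(self, request, knowledge_id: str, document_id: str):
    ...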
@@ -1,16 +1,18 @@
+import logging
 import os
+import traceback
 from functools import reduce
 from typing import Dict, List
 
 import uuid_utils.compat as uuid
 from celery_once import AlreadyQueued
-from django.db import transaction
+from django.db import transaction, models
 from django.db.models import QuerySet, Model
 from django.db.models.functions import Substr, Reverse
 from django.utils.translation import gettext_lazy as _
 from rest_framework import serializers
 
-from common.db.search import native_search
+from common.db.search import native_search, get_dynamics_model, native_page_search
 from common.event import ListenerManagement
 from common.event.common import work_thread_pool
 from common.exception.app_exception import AppApiException
@@ -23,12 +25,15 @@ from common.handle.impl.text.xls_split_handle import XlsSplitHandle
 from common.handle.impl.text.xlsx_split_handle import XlsxSplitHandle
 from common.handle.impl.text.zip_split_handle import ZipSplitHandle
 from common.utils.common import post, get_file_content, bulk_create_in_batches
+from common.utils.fork import Fork
+from common.utils.split_model import get_split_model
 from knowledge.models import Knowledge, Paragraph, Problem, Document, KnowledgeType, ProblemParagraphMapping, State, \
     TaskType, File
-from knowledge.serializers.common import ProblemParagraphManage, BatchSerializer
+from knowledge.serializers.common import ProblemParagraphManage, BatchSerializer, get_embedding_model_id_by_knowledge_id
 from knowledge.serializers.paragraph import ParagraphSerializers, ParagraphInstanceSerializer, \
     delete_problems_and_mappings
-from knowledge.task import embedding_by_document, delete_embedding_by_document_list
+from knowledge.task.embedding import embedding_by_document, delete_embedding_by_document_list, \
+    delete_embedding_by_document
 from maxkb.const import PROJECT_DIR
 
 default_split_handle = TextSplitHandle()
@@ -62,13 +67,6 @@ class DocumentInstanceSerializer(serializers.Serializer):
     paragraphs = ParagraphInstanceSerializer(required=False, many=True, allow_null=True)
 
 
-class DocumentCreateRequest(serializers.Serializer):
-    name = serializers.CharField(required=True, label=_('knowledge name'), max_length=64, min_length=1)
-    desc = serializers.CharField(required=True, label=_('knowledge description'), max_length=256, min_length=1)
-    embedding_model_id = serializers.UUIDField(required=True, label=_('embedding model'))
-    documents = DocumentInstanceSerializer(required=False, many=True)
-
-
 class DocumentSplitRequest(serializers.Serializer):
     file = serializers.ListField(required=True, label=_('file list'))
     limit = serializers.IntegerField(required=False, label=_('limit'))
@@ -80,18 +78,153 @@ class DocumentSplitRequest(serializers.Serializer):
     with_filter = serializers.BooleanField(required=False, label=_('Auto Clean'))
 
 
-class DocumentBatchRequest(serializers.Serializer):
-    file = serializers.ListField(required=True, label=_('file list'))
-    limit = serializers.IntegerField(required=False, label=_('limit'))
-    patterns = serializers.ListField(
-        required=False,
-        child=serializers.CharField(required=True, label=_('patterns')),
-        label=_('patterns')
-    )
-    with_filter = serializers.BooleanField(required=False, label=_('Auto Clean'))
-
-
 class DocumentSerializers(serializers.Serializer):
+    class Query(serializers.Serializer):
+        # knowledge base id
+        workspace_id = serializers.CharField(required=True, label=_('workspace id'))
+        knowledge_id = serializers.UUIDField(required=True, label=_('knowledge id'))
+        name = serializers.CharField(required=False, max_length=128, min_length=1, label=_('document name'))
+        hit_handling_method = serializers.CharField(required=False, label=_('hit handling method'))
+        is_active = serializers.BooleanField(required=False, label=_('document is active'))
+        task_type = serializers.IntegerField(required=False, label=_('task type'))
+        status = serializers.CharField(required=False, label=_('status'))
+        order_by = serializers.CharField(required=False, label=_('order by'))
+
+        def get_query_set(self):
+            query_set = QuerySet(model=Document)
+            query_set = query_set.filter(**{'knowledge_id': self.data.get("knowledge_id")})
+            if 'name' in self.data and self.data.get('name') is not None:
+                query_set = query_set.filter(**{'name__icontains': self.data.get('name')})
+            if 'hit_handling_method' in self.data and self.data.get('hit_handling_method') is not None:
+                query_set = query_set.filter(**{'hit_handling_method': self.data.get('hit_handling_method')})
+            if 'is_active' in self.data and self.data.get('is_active') is not None:
+                query_set = query_set.filter(**{'is_active': self.data.get('is_active')})
+            if 'status' in self.data and self.data.get('status') is not None:
+                task_type = self.data.get('task_type')
+                status = self.data.get('status')
+                if task_type is not None:
+                    query_set = query_set.annotate(
+                        reversed_status=Reverse('status'),
+                        task_type_status=Substr('reversed_status', TaskType(task_type).value, 1),
+                    ).filter(task_type_status=State(status).value).values('id')
+                else:
+                    if status != State.SUCCESS.value:
+                        query_set = query_set.filter(status__icontains=status)
+                    else:
+                        query_set = query_set.filter(status__iregex='^[2n]*$')
+            order_by = self.data.get('order_by', '')
+            order_by_query_set = QuerySet(model=get_dynamics_model(
+                {'char_length': models.CharField(), 'paragraph_count': models.IntegerField(),
+                 "update_time": models.IntegerField(), 'create_time': models.DateTimeField()}))
+            if order_by:
+                order_by_query_set = order_by_query_set.order_by(order_by)
+            else:
+                order_by_query_set = order_by_query_set.order_by('-create_time', 'id')
+            return {
+                'document_custom_sql': query_set,
+                'order_by_query': order_by_query_set
+            }
+
+        def list(self, with_valid=False):
+            if with_valid:
+                self.is_valid(raise_exception=True)
+            query_set = self.get_query_set()
+            return native_search(query_set, select_string=get_file_content(
+                os.path.join(PROJECT_DIR, "apps", "knowledge", 'sql', 'list_document.sql')))
+
+        def page(self, current_page, page_size):
+            query_set = self.get_query_set()
+            return native_page_search(current_page, page_size, query_set, select_string=get_file_content(
+                os.path.join(PROJECT_DIR, "apps", "knowledge", 'sql', 'list_document.sql')))
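The Query serializer turns request parameters into the queryset pair that common.db.search consumes. Note the task-type filter: Document.status is stored as a string with one state character per task type, so the annotation reverses it and reads the character at position TaskType(task_type).value via Substr. A hedged usage sketch (ids and values are placeholders):

from knowledge.serializers.document import DocumentSerializers

query = DocumentSerializers.Query(data={
    'workspace_id': 'default',                               # placeholder
    'knowledge_id': '018f0000-0000-7000-8000-000000000000',  # placeholder UUID
    'name': 'handbook',                                      # optional icontains filter
})
rows = query.list(with_valid=True)               # full result via list_document.sql
page = query.page(current_page=1, page_size=20)  # paginated via native_page_search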
+
+    class Sync(serializers.Serializer):
+        document_id = serializers.UUIDField(required=True, label=_('document id'))
+
+        def is_valid(self, *, raise_exception=False):
+            super().is_valid(raise_exception=True)
+            document_id = self.data.get('document_id')
+            first = QuerySet(Document).filter(id=document_id).first()
+            if first is None:
+                raise AppApiException(500, _('document id not exist'))
+            if first.type != KnowledgeType.WEB:
+                raise AppApiException(500, _('Synchronization is only supported for web site types'))
+
+        def sync(self, with_valid=True, with_embedding=True):
+            if with_valid:
+                self.is_valid(raise_exception=True)
+            document_id = self.data.get('document_id')
+            document = QuerySet(Document).filter(id=document_id).first()
+            state = State.SUCCESS
+            if document.type != KnowledgeType.WEB:
+                return True
+            try:
+                ListenerManagement.update_status(QuerySet(Document).filter(id=document_id),
+                                                 TaskType.SYNC,
+                                                 State.PENDING)
+                ListenerManagement.get_aggregation_document_status(document_id)()
+                source_url = document.meta.get('source_url')
+                selector_list = document.meta.get('selector').split(
+                    " ") if 'selector' in document.meta and document.meta.get('selector') is not None else []
+                result = Fork(source_url, selector_list).fork()
+                if result.status == 200:
+                    # delete paragraphs
+                    QuerySet(model=Paragraph).filter(document_id=document_id).delete()
+                    # delete problems
+                    QuerySet(model=ProblemParagraphMapping).filter(document_id=document_id).delete()
+                    delete_problems_and_mappings([document_id])
+                    # delete vector store entries
+                    delete_embedding_by_document(document_id)
+                    paragraphs = get_split_model('web.md').parse(result.content)
+                    char_length = reduce(lambda x, y: x + y,
+                                         [len(p.get('content')) for p in paragraphs],
+                                         0)
+                    QuerySet(Document).filter(id=document_id).update(char_length=char_length)
+                    document_paragraph_model = DocumentSerializers.Create.get_paragraph_model(document, paragraphs)
+
+                    paragraph_model_list = document_paragraph_model.get('paragraph_model_list')
+                    problem_paragraph_object_list = document_paragraph_model.get('problem_paragraph_object_list')
+                    problem_model_list, problem_paragraph_mapping_list = ProblemParagraphManage(
+                        problem_paragraph_object_list, document.knowledge_id).to_problem_model_list()
+                    # bulk-insert paragraphs
+                    QuerySet(Paragraph).bulk_create(paragraph_model_list) if len(paragraph_model_list) > 0 else None
+                    # bulk-insert problems
+                    QuerySet(Problem).bulk_create(problem_model_list) if len(problem_model_list) > 0 else None
+                    # insert problem-paragraph mappings
+                    QuerySet(ProblemParagraphMapping).bulk_create(problem_paragraph_mapping_list) if len(
+                        problem_paragraph_mapping_list) > 0 else None
+                    # vectorize
+                    if with_embedding:
+                        embedding_model_id = get_embedding_model_id_by_knowledge_id(document.knowledge_id)
+                        ListenerManagement.update_status(QuerySet(Document).filter(id=document_id),
+                                                         TaskType.EMBEDDING,
+                                                         State.PENDING)
+                        ListenerManagement.update_status(QuerySet(Paragraph).filter(document_id=document_id),
+                                                         TaskType.EMBEDDING,
+                                                         State.PENDING)
+                        ListenerManagement.get_aggregation_document_status(document_id)()
+                        embedding_by_document.delay(document_id, embedding_model_id)
+
+                else:
+                    state = State.FAILURE
+            except Exception as e:
+                logging.getLogger("max_kb_error").error(f'{str(e)}:{traceback.format_exc()}')
+                state = State.FAILURE
+            ListenerManagement.update_status(
+                QuerySet(Document).filter(id=document_id),
+                TaskType.SYNC,
+                state
+            )
+            ListenerManagement.update_status(
+                QuerySet(Paragraph).filter(document_id=document_id),
+                TaskType.SYNC,
+                state
+            )
+            ListenerManagement.get_aggregation_document_status(document_id)()
+            return True
+
     class Operate(serializers.Serializer):
         document_id = serializers.UUIDField(required=True, label=_('document id'))
         knowledge_id = serializers.UUIDField(required=True, label=_('knowledge id'))
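Sync re-crawls a WEB-type document: it clears the old paragraphs, problems, and vectors, re-splits the fetched page, bulk-inserts the new rows, and optionally queues re-embedding. A minimal call sketch, matching the call site in the sync task later in this diff (the id is a placeholder):

from knowledge.serializers.document import DocumentSerializers

DocumentSerializers.Sync(
    data={'document_id': '018f0000-0000-7000-8000-000000000000'}  # placeholder UUID
).sync(with_valid=True, with_embedding=True)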
@@ -148,6 +281,7 @@ class DocumentSerializers(serializers.Serializer):
             raise AppApiException(500, _('The task is being executed, please do not send it repeatedly.'))
 
     class Create(serializers.Serializer):
+        workspace_id = serializers.UUIDField(required=True, label=_('workspace id'))
         knowledge_id = serializers.UUIDField(required=True, label=_('document id'))
 
         def is_valid(self, *, raise_exception=False):
@@ -166,7 +300,7 @@ class DocumentSerializers(serializers.Serializer):
         @transaction.atomic
         def save(self, instance: Dict, with_valid=False, **kwargs):
             if with_valid:
-                DocumentCreateRequest(data=instance).is_valid(raise_exception=True)
+                DocumentInstanceSerializer(data=instance).is_valid(raise_exception=True)
                 self.is_valid(raise_exception=True)
             knowledge_id = self.data.get('knowledge_id')
             document_paragraph_model = self.get_document_paragraph_model(knowledge_id, instance)
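With DocumentCreateRequest removed, Create.save now validates its payload against DocumentInstanceSerializer. A hedged sketch of the call shape, modeled on the new view's post handler and the web-sync handlers later in this diff (all values are placeholders):

from knowledge.models import KnowledgeType
from knowledge.serializers.document import DocumentSerializers

DocumentSerializers.Create(
    data={'workspace_id': 'default', 'knowledge_id': '018f0000-0000-7000-8000-000000000000'}  # placeholders
).save({
    'name': 'Example page',                                        # placeholder
    'paragraphs': [],                                              # e.g. get_split_model('web.md').parse(content)
    'meta': {'source_url': 'https://example.com', 'selector': ''}, # placeholders
    'type': KnowledgeType.WEB,
}, with_valid=True)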
@@ -16,7 +16,8 @@ from knowledge.models import Knowledge, KnowledgeScope, KnowledgeType, Document,
     ProblemParagraphMapping, ApplicationKnowledgeMapping
 from knowledge.serializers.common import ProblemParagraphManage, get_embedding_model_id_by_knowledge_id, MetaSerializer
 from knowledge.serializers.document import DocumentSerializers
-from knowledge.task import sync_web_knowledge, embedding_by_knowledge, delete_embedding_by_knowledge
+from knowledge.task.embedding import embedding_by_knowledge, delete_embedding_by_knowledge
+from knowledge.task.sync import sync_web_knowledge
 from maxkb.conf import PROJECT_DIR
 
 
@@ -10,12 +10,13 @@ from rest_framework import serializers
 
 from common.exception.app_exception import AppApiException
 from common.utils.common import post
-from knowledge.models import Paragraph, Problem, Document, ProblemParagraphMapping
+from knowledge.models import Paragraph, Problem, Document, ProblemParagraphMapping, SourceType
 from knowledge.serializers.common import ProblemParagraphObject, ProblemParagraphManage, \
     get_embedding_model_id_by_knowledge_id, update_document_char_length
-from knowledge.serializers.problem import ProblemInstanceSerializer, ProblemSerializer
-from knowledge.task import embedding_by_paragraph, enable_embedding_by_paragraph, disable_embedding_by_paragraph, \
-    delete_embedding_by_paragraph
+from knowledge.serializers.problem import ProblemInstanceSerializer, ProblemSerializer, ProblemSerializers
+from knowledge.task.embedding import embedding_by_paragraph, enable_embedding_by_paragraph, \
+    disable_embedding_by_paragraph, \
+    delete_embedding_by_paragraph, embedding_by_problem as embedding_by_problem_task
 
 
 class ParagraphSerializer(serializers.ModelSerializer):
@@ -49,6 +50,70 @@ class ParagraphSerializers(serializers.Serializer):
                                       allow_blank=True)
         content = serializers.CharField(required=True, max_length=102400, label=_('section title'))
 
+    class Problem(serializers.Serializer):
+        knowledge_id = serializers.UUIDField(required=True, label=_('knowledge id'))
+        document_id = serializers.UUIDField(required=True, label=_('document id'))
+        paragraph_id = serializers.UUIDField(required=True, label=_('paragraph id'))
+
+        def is_valid(self, *, raise_exception=False):
+            super().is_valid(raise_exception=True)
+            if not QuerySet(Paragraph).filter(id=self.data.get('paragraph_id')).exists():
+                raise AppApiException(500, _('Paragraph id does not exist'))
+
+        def list(self, with_valid=False):
+            """
+            Get the problem list.
+            :param with_valid: whether to validate first
+            :return: problem list
+            """
+            if with_valid:
+                self.is_valid(raise_exception=True)
+            problem_paragraph_mapping = QuerySet(ProblemParagraphMapping).filter(
+                knowledge_id=self.data.get("knowledge_id"),
+                paragraph_id=self.data.get('paragraph_id'))
+            return [ProblemSerializer(row).data for row in
+                    QuerySet(Problem).filter(id__in=[row.problem_id for row in problem_paragraph_mapping])]
+
+        @transaction.atomic
+        def save(self, instance: Dict, with_valid=True, with_embedding=True, embedding_by_problem=None):
+            if with_valid:
+                self.is_valid()
+                ProblemInstanceSerializer(data=instance).is_valid(raise_exception=True)
+            problem = QuerySet(Problem).filter(knowledge_id=self.data.get('knowledge_id'),
+                                               content=instance.get('content')).first()
+            if problem is None:
+                problem = Problem(id=uuid.uuid7(), knowledge_id=self.data.get('knowledge_id'),
+                                  content=instance.get('content'))
+                problem.save()
+            if QuerySet(ProblemParagraphMapping).filter(knowledge_id=self.data.get('knowledge_id'),
+                                                        problem_id=problem.id,
+                                                        paragraph_id=self.data.get('paragraph_id')).exists():
+                raise AppApiException(500, _('Already associated, please do not associate again'))
+            problem_paragraph_mapping = ProblemParagraphMapping(
+                id=uuid.uuid7(),
+                problem_id=problem.id,
+                document_id=self.data.get('document_id'),
+                paragraph_id=self.data.get('paragraph_id'),
+                knowledge_id=self.data.get('knowledge_id')
+            )
+            problem_paragraph_mapping.save()
+            model_id = get_embedding_model_id_by_knowledge_id(self.data.get('knowledge_id'))
+            if with_embedding:
+                embedding_by_problem_task({
+                    'text': problem.content,
+                    'is_active': True,
+                    'source_type': SourceType.PROBLEM,
+                    'source_id': problem_paragraph_mapping.id,
+                    'document_id': self.data.get('document_id'),
+                    'paragraph_id': self.data.get('paragraph_id'),
+                    'knowledge_id': self.data.get('knowledge_id'),
+                }, model_id)
+
+            return ProblemSerializers.Operate(
+                data={'knowledge_id': self.data.get('knowledge_id'), 'problem_id': problem.id}
+            ).one(with_valid=True)
+
     class Operate(serializers.Serializer):
         # paragraph id
         paragraph_id = serializers.UUIDField(required=True, label=_('paragraph id'))
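ParagraphSerializers.Problem associates a question with an existing paragraph, creating the Problem row on first use and queueing a problem embedding tagged SourceType.PROBLEM; list() returns the questions already mapped to a paragraph. A hedged sketch, matching the save_problem helper in the task module below (ids are placeholders bound by the caller):

from knowledge.serializers.paragraph import ParagraphSerializers

ParagraphSerializers.Problem(
    data={
        'knowledge_id': knowledge_id,
        'document_id': document_id,
        'paragraph_id': paragraph_id,
    }
).save(instance={'content': 'What is MaxKB?'}, with_valid=True)  # placeholder question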
@@ -1,7 +1,17 @@
+import os
+from typing import Dict
+
+from django.db import transaction
+from django.db.models import QuerySet
 from django.utils.translation import gettext_lazy as _
 from rest_framework import serializers
 
-from knowledge.models import Problem
+from common.db.search import native_search
+from common.utils.common import get_file_content
+from knowledge.models import Problem, ProblemParagraphMapping, Paragraph, Knowledge
+from knowledge.serializers.common import get_embedding_model_id_by_knowledge_id
+from knowledge.task.embedding import delete_embedding_by_source_ids, update_problem_embedding
+from maxkb.const import PROJECT_DIR
 
 
 class ProblemSerializer(serializers.ModelSerializer):
@@ -13,3 +23,55 @@ class ProblemSerializer(serializers.ModelSerializer):
 class ProblemInstanceSerializer(serializers.Serializer):
     id = serializers.CharField(required=False, label=_('problem id'))
     content = serializers.CharField(required=True, max_length=256, label=_('content'))
+
+
+class ProblemSerializers(serializers.Serializer):
+    class Operate(serializers.Serializer):
+        knowledge_id = serializers.UUIDField(required=True, label=_('knowledge id'))
+        problem_id = serializers.UUIDField(required=True, label=_('problem id'))
+
+        def list_paragraph(self, with_valid=True):
+            if with_valid:
+                self.is_valid(raise_exception=True)
+            problem_paragraph_mapping = QuerySet(ProblemParagraphMapping).filter(
+                knowledge_id=self.data.get("knowledge_id"),
+                problem_id=self.data.get("problem_id")
+            )
+            if problem_paragraph_mapping is None or len(problem_paragraph_mapping) == 0:
+                return []
+            return native_search(
+                QuerySet(Paragraph).filter(id__in=[row.paragraph_id for row in problem_paragraph_mapping]),
+                select_string=get_file_content(
+                    os.path.join(PROJECT_DIR, "apps", "knowledge", 'sql', 'list_paragraph.sql')))
+
+        def one(self, with_valid=True):
+            if with_valid:
+                self.is_valid(raise_exception=True)
+            return ProblemInstanceSerializer(QuerySet(Problem).get(**{'id': self.data.get('problem_id')})).data
+
+        @transaction.atomic
+        def delete(self, with_valid=True):
+            if with_valid:
+                self.is_valid(raise_exception=True)
+            problem_paragraph_mapping_list = QuerySet(ProblemParagraphMapping).filter(
+                knowledge_id=self.data.get('knowledge_id'),
+                problem_id=self.data.get('problem_id'))
+            source_ids = [row.id for row in problem_paragraph_mapping_list]
+            problem_paragraph_mapping_list.delete()
+            QuerySet(Problem).filter(id=self.data.get('problem_id')).delete()
+            delete_embedding_by_source_ids(source_ids)
+            return True
+
+        @transaction.atomic
+        def edit(self, instance: Dict, with_valid=True):
+            if with_valid:
+                self.is_valid(raise_exception=True)
+            problem_id = self.data.get('problem_id')
+            knowledge_id = self.data.get('knowledge_id')
+            content = instance.get('content')
+            problem = QuerySet(Problem).filter(id=problem_id, knowledge_id=knowledge_id).first()
+            QuerySet(Knowledge).filter(id=knowledge_id)
+            problem.content = content
+            problem.save()
+            model_id = get_embedding_model_id_by_knowledge_id(knowledge_id)
+            update_problem_embedding(problem_id, content, model_id)
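ProblemSerializers.Operate is the problem-level counterpart; ParagraphSerializers.Problem.save above returns through its one(). A hedged sketch of the operations (ids and content are placeholders):

from knowledge.serializers.problem import ProblemSerializers

op = ProblemSerializers.Operate(data={'knowledge_id': knowledge_id, 'problem_id': problem_id})
detail = op.one(with_valid=True)            # single problem as ProblemInstanceSerializer data
paragraphs = op.list_paragraph()            # mapped paragraphs via list_paragraph.sql
op.edit({'content': 'Updated question'})    # rewrites content and refreshes its embedding
op.delete()                                 # drops mappings, the problem, and its vectors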
@@ -1,2 +0,0 @@
-from .sync import *
-from .embedding import *
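The knowledge.task package no longer re-exports its submodules, presumably to break the import cycle that previously forced the function-local serializer imports removed from the sync handlers below. Every caller in this commit switches to explicit submodule imports:

# before, via the deleted knowledge/task/__init__.py re-exports
from knowledge.task import embedding_by_document, sync_web_knowledge
# after
from knowledge.task.embedding import embedding_by_document
from knowledge.task.sync import sync_web_knowledge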
@@ -11,24 +11,28 @@ from django.utils.translation import gettext_lazy as _
 from common.utils.fork import ChildLink, Fork
 from common.utils.split_model import get_split_model
 from knowledge.models.knowledge import KnowledgeType, Document, Knowledge, Status
+from knowledge.serializers.document import DocumentSerializers
+from knowledge.serializers.paragraph import ParagraphSerializers
 
 max_kb_error = logging.getLogger("max_kb_error")
 max_kb = logging.getLogger("max_kb")
 
 
 def get_save_handler(knowledge_id, selector):
-    from knowledge.serializers import DocumentSerializers
-
     def handler(child_link: ChildLink, response: Fork.Response):
         if response.status == 200:
             try:
                 document_name = child_link.tag.text if child_link.tag is not None and len(
                     child_link.tag.text.strip()) > 0 else child_link.url
                 paragraphs = get_split_model('web.md').parse(response.content)
-                DocumentSerializers.Create(data={'knowledge_id': knowledge_id}).save(
-                    {'name': document_name, 'paragraphs': paragraphs,
-                     'meta': {'source_url': child_link.url, 'selector': selector},
-                     'type': KnowledgeType.WEB}, with_valid=True)
+                DocumentSerializers.Create(
+                    data={'knowledge_id': knowledge_id}
+                ).save({
+                    'name': document_name,
+                    'paragraphs': paragraphs,
+                    'meta': {'source_url': child_link.url, 'selector': selector},
+                    'type': KnowledgeType.WEB
+                }, with_valid=True)
             except Exception as e:
                 logging.getLogger("max_kb_error").error(f'{str(e)}:{traceback.format_exc()}')
 
@@ -36,7 +40,6 @@ def get_save_handler(knowledge_id, selector):
 
 
 def get_sync_handler(knowledge_id):
-    from knowledge.serializers import DocumentSerializers
     knowledge = QuerySet(Knowledge).filter(id=knowledge_id).first()
 
     def handler(child_link: ChildLink, response: Fork.Response):
@@ -52,10 +55,14 @@ def get_sync_handler(knowledge_id):
                     DocumentSerializers.Sync(data={'document_id': first.id}).sync()
                 else:
                     # insert
-                    DocumentSerializers.Create(data={'knowledge_id': knowledge.id}).save(
-                        {'name': document_name, 'paragraphs': paragraphs,
-                         'meta': {'source_url': child_link.url.strip(), 'selector': knowledge.meta.get('selector')},
-                         'type': KnowledgeType.WEB}, with_valid=True)
+                    DocumentSerializers.Create(
+                        data={'knowledge_id': knowledge.id}
+                    ).save({
+                        'name': document_name,
+                        'paragraphs': paragraphs,
+                        'meta': {'source_url': child_link.url.strip(), 'selector': knowledge.meta.get('selector')},
+                        'type': KnowledgeType.WEB
+                    }, with_valid=True)
             except Exception as e:
                 logging.getLogger("max_kb_error").error(f'{str(e)}:{traceback.format_exc()}')
 
@@ -63,8 +70,6 @@ def get_sync_handler(knowledge_id):
 
 
 def get_sync_web_document_handler(knowledge_id):
-    from knowledge.serializers import DocumentSerializers
-
     def handler(source_url: str, selector, response: Fork.Response):
         if response.status == 200:
             try:
@@ -88,7 +93,6 @@ def get_sync_web_document_handler(knowledge_id):
 
 
 def save_problem(knowledge_id, document_id, paragraph_id, problem):
-    from knowledge.serializers import ParagraphSerializers
     # print(f"knowledge_id: {knowledge_id}")
     # print(f"document_id: {document_id}")
     # print(f"paragraph_id: {paragraph_id}")
@@ -101,7 +105,11 @@ def save_problem(knowledge_id, document_id, paragraph_id, problem):
         return
     try:
         ParagraphSerializers.Problem(
-            data={"knowledge_id": knowledge_id, 'document_id': document_id,
-                  'paragraph_id': paragraph_id}).save(instance={"content": problem}, with_valid=True)
+            data={
+                "knowledge_id": knowledge_id,
+                'document_id': document_id,
+                'paragraph_id': paragraph_id
+            }
+        ).save(instance={"content": problem}, with_valid=True)
     except Exception as e:
         max_kb_error.error(_('Association problem failed {error}').format(error=str(e)))
@@ -8,7 +8,9 @@ urlpatterns = [
     path('workspace/<str:workspace_id>/knowledge/base', views.KnowledgeBaseView.as_view()),
     path('workspace/<str:workspace_id>/knowledge/web', views.KnowledgeWebView.as_view()),
     path('workspace/<str:workspace_id>/knowledge/<str:knowledge_id>', views.KnowledgeView.Operate.as_view()),
+    path('workspace/<str:workspace_id>/knowledge/<str:knowledge_id>/document', views.DocumentView.as_view()),
     path('workspace/<str:workspace_id>/knowledge/<str:knowledge_id>/document/split', views.DocumentView.Split.as_view()),
     path('workspace/<str:workspace_id>/knowledge/<str:knowledge_id>/document/batch', views.DocumentView.Batch.as_view()),
+    path('workspace/<str:workspace_id>/knowledge/<str:knowledge_id>/document/<str:document_id>', views.DocumentView.Operate.as_view()),
     path('workspace/<str:workspace_id>/knowledge/<int:current_page>/<int:page_size>', views.KnowledgeView.Page.as_view()),
 ]
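Together with the DocumentView changes below, these two routes complete the document CRUD surface. A hedged sketch of the resulting calls (host prefix, token scheme, and payload values are assumptions, not part of this commit):

import requests

BASE = 'http://localhost:8080'               # assumed host
HEADERS = {'Authorization': '<token>'}       # TokenAuth credential, placeholder
kb = f'{BASE}/workspace/ws1/knowledge/<knowledge_id>'

requests.post(f'{kb}/document', json={'name': 'doc', 'paragraphs': []}, headers=HEADERS)  # create
requests.get(f'{kb}/document', params={'name': 'doc'}, headers=HEADERS)                   # query/list
doc = f'{kb}/document/<document_id>'
requests.get(doc, headers=HEADERS)                             # details
requests.put(doc, json={'name': 'renamed'}, headers=HEADERS)   # modify
requests.delete(doc, headers=HEADERS)                          # delete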
@@ -8,28 +8,45 @@ from common.auth import TokenAuth
 from common.auth.authentication import has_permissions
 from common.constants.permission_constants import PermissionConstants
 from common.result import result
-from knowledge.api.document import DocumentSplitAPI, DocumentBatchAPI, DocumentBatchCreateAPI
+from knowledge.api.document import DocumentSplitAPI, DocumentBatchAPI, DocumentBatchCreateAPI, DocumentCreateAPI, \
+    DocumentReadAPI, DocumentEditAPI, DocumentDeleteAPI
 from knowledge.api.knowledge import KnowledgeTreeReadAPI
 from knowledge.serializers.document import DocumentSerializers
-from knowledge.serializers.knowledge import KnowledgeSerializer
 
 
 class DocumentView(APIView):
     authentication_classes = [TokenAuth]
 
+    @extend_schema(
+        methods=['POST'],
+        description=_('Create document'),
+        operation_id=_('Create document'),
+        request=DocumentCreateAPI.get_request(),
+        parameters=DocumentCreateAPI.get_parameters(),
+        responses=DocumentCreateAPI.get_response(),
+        tags=[_('Knowledge Base/Documentation')]
+    )
+    @has_permissions(PermissionConstants.DOCUMENT_CREATE.get_workspace_permission())
+    def post(self, request: Request, workspace_id: str, knowledge_id: str):
+        return result.success(
+            DocumentSerializers.Create(
+                data={'workspace_id': workspace_id, 'knowledge_id': knowledge_id},
+            ).save(request.data))
+
     @extend_schema(
         methods=['GET'],
         description=_('Get document'),
         operation_id=_('Get document'),
         parameters=KnowledgeTreeReadAPI.get_parameters(),
         responses=KnowledgeTreeReadAPI.get_response(),
-        tags=[_('Knowledge Base')]
+        tags=[_('Knowledge Base/Documentation')]
    )
     @has_permissions(PermissionConstants.DOCUMENT_READ.get_workspace_permission())
-    def get(self, request: Request, workspace_id: str):
-        return result.success(KnowledgeSerializer.Query(
+    def get(self, request: Request, workspace_id: str, knowledge_id: str):
+        return result.success(DocumentSerializers.Query(
             data={
                 'workspace_id': workspace_id,
+                'knowledge_id': knowledge_id,
                 'folder_id': request.query_params.get('folder_id'),
                 'name': request.query_params.get('name'),
                 'desc': request.query_params.get("desc"),
@@ -37,6 +54,50 @@ class DocumentView(APIView):
             }
         ).list())
 
+    class Operate(APIView):
+        authentication_classes = [TokenAuth]
+
+        @extend_schema(
+            description=_('Get document details'),
+            operation_id=_('Get document details'),
+            parameters=DocumentReadAPI.get_parameters(),
+            responses=DocumentReadAPI.get_response(),
+            tags=[_('Knowledge Base/Documentation')]
+        )
+        @has_permissions(PermissionConstants.DOCUMENT_READ.get_workspace_permission())
+        def get(self, request: Request, knowledge_id: str, document_id: str):
+            operate = DocumentSerializers.Operate(data={'document_id': document_id, 'knowledge_id': knowledge_id})
+            operate.is_valid(raise_exception=True)
+            return result.success(operate.one())
+
+        @extend_schema(
+            description=_('Modify document'),
+            operation_id=_('Modify document'),
+            parameters=DocumentEditAPI.get_parameters(),
+            request=DocumentEditAPI.get_request(),
+            responses=DocumentEditAPI.get_response(),
+            tags=[_('Knowledge Base/Documentation')]
+        )
+        @has_permissions(PermissionConstants.DOCUMENT_EDIT.get_workspace_permission())
+        def put(self, request: Request, knowledge_id: str, document_id: str):
+            return result.success(
+                DocumentSerializers.Operate(data={'document_id': document_id, 'knowledge_id': knowledge_id}).edit(
+                    request.data,
+                    with_valid=True))
+
+        @extend_schema(
+            description=_('Delete document'),
+            operation_id=_('Delete document'),
+            parameters=DocumentDeleteAPI.get_parameters(),
+            responses=DocumentDeleteAPI.get_response(),
+            tags=[_('Knowledge Base/Documentation')]
+        )
+        @has_permissions(PermissionConstants.DOCUMENT_DELETE.get_workspace_permission())
+        def delete(self, request: Request, knowledge_id: str, document_id: str):
+            operate = DocumentSerializers.Operate(data={'document_id': document_id, 'knowledge_id': knowledge_id})
+            operate.is_valid(raise_exception=True)
+            return result.success(operate.delete())
+
     class Split(APIView):
         authentication_classes = [TokenAuth]
         parser_classes = [MultiPartParser]