feat: enhance Document API with create and query functionalities

This commit is contained in:
CaptainB 2025-05-06 11:05:34 +08:00
parent ba5028858c
commit 24e734fb36
9 changed files with 455 additions and 75 deletions

View File

@ -4,29 +4,7 @@ from drf_spectacular.utils import OpenApiParameter
from common.mixins.api_mixin import APIMixin from common.mixins.api_mixin import APIMixin
from common.result import DefaultResultSerializer from common.result import DefaultResultSerializer
from knowledge.serializers.common import BatchSerializer from knowledge.serializers.common import BatchSerializer
from knowledge.serializers.document import DocumentCreateRequest, DocumentInstanceSerializer from knowledge.serializers.document import DocumentInstanceSerializer
class DocumentCreateAPI(APIMixin):
    """OpenAPI schema pieces for the document-create endpoint."""

    @staticmethod
    def get_parameters():
        """Path parameters accepted by the endpoint."""
        workspace = OpenApiParameter(
            name="workspace_id",
            description="工作空间id",
            type=OpenApiTypes.STR,
            location='path',
            required=True,
        )
        return [workspace]

    @staticmethod
    def get_request():
        """Serializer describing the request body."""
        return DocumentCreateRequest

    @staticmethod
    def get_response():
        """Serializer describing the response envelope."""
        return DefaultResultSerializer
class DocumentSplitAPI(APIMixin): class DocumentSplitAPI(APIMixin):
@ -127,3 +105,74 @@ class DocumentBatchCreateAPI(APIMixin):
@staticmethod @staticmethod
def get_response(): def get_response():
return DefaultResultSerializer return DefaultResultSerializer
class DocumentCreateAPI(APIMixin):
    """OpenAPI schema pieces for creating a document inside a knowledge base."""

    @staticmethod
    def get_parameters():
        """Path parameters: the owning workspace and knowledge base."""
        return [
            OpenApiParameter(
                name=param_name,
                description=param_desc,
                type=OpenApiTypes.STR,
                location='path',
                required=True,
            )
            for param_name, param_desc in (
                ("workspace_id", "工作空间id"),
                ("knowledge_id", "知识库id"),
            )
        ]

    @staticmethod
    def get_request():
        """Serializer describing the document payload."""
        return DocumentInstanceSerializer

    @staticmethod
    def get_response():
        """Serializer describing the response envelope."""
        return DefaultResultSerializer
class DocumentReadAPI(APIMixin):
    """OpenAPI schema pieces for reading a single document."""

    @staticmethod
    def get_parameters():
        """Path parameters: workspace, knowledge base and document identifiers."""
        return [
            OpenApiParameter(
                name=param_name,
                description=param_desc,
                type=OpenApiTypes.STR,
                location='path',
                required=True,
            )
            for param_name, param_desc in (
                ("workspace_id", "工作空间id"),
                ("knowledge_id", "知识库id"),
                ("document_id", "文档id"),
            )
        ]

    @staticmethod
    def get_response():
        """Serializer describing the response envelope."""
        return DefaultResultSerializer
class DocumentEditAPI(DocumentReadAPI):
    """Document edit schema: read's path parameters plus a request body."""

    @staticmethod
    def get_request():
        """Serializer describing the editable document payload."""
        return DocumentInstanceSerializer
class DocumentDeleteAPI(DocumentReadAPI):
    # Delete reuses read's path parameters and response; it has no request body.
    pass

View File

@ -1,16 +1,18 @@
import logging
import os import os
import traceback
from functools import reduce from functools import reduce
from typing import Dict, List from typing import Dict, List
import uuid_utils.compat as uuid import uuid_utils.compat as uuid
from celery_once import AlreadyQueued from celery_once import AlreadyQueued
from django.db import transaction from django.db import transaction, models
from django.db.models import QuerySet, Model from django.db.models import QuerySet, Model
from django.db.models.functions import Substr, Reverse from django.db.models.functions import Substr, Reverse
from django.utils.translation import gettext_lazy as _ from django.utils.translation import gettext_lazy as _
from rest_framework import serializers from rest_framework import serializers
from common.db.search import native_search from common.db.search import native_search, get_dynamics_model, native_page_search
from common.event import ListenerManagement from common.event import ListenerManagement
from common.event.common import work_thread_pool from common.event.common import work_thread_pool
from common.exception.app_exception import AppApiException from common.exception.app_exception import AppApiException
@ -23,12 +25,15 @@ from common.handle.impl.text.xls_split_handle import XlsSplitHandle
from common.handle.impl.text.xlsx_split_handle import XlsxSplitHandle from common.handle.impl.text.xlsx_split_handle import XlsxSplitHandle
from common.handle.impl.text.zip_split_handle import ZipSplitHandle from common.handle.impl.text.zip_split_handle import ZipSplitHandle
from common.utils.common import post, get_file_content, bulk_create_in_batches from common.utils.common import post, get_file_content, bulk_create_in_batches
from common.utils.fork import Fork
from common.utils.split_model import get_split_model
from knowledge.models import Knowledge, Paragraph, Problem, Document, KnowledgeType, ProblemParagraphMapping, State, \ from knowledge.models import Knowledge, Paragraph, Problem, Document, KnowledgeType, ProblemParagraphMapping, State, \
TaskType, File TaskType, File
from knowledge.serializers.common import ProblemParagraphManage, BatchSerializer from knowledge.serializers.common import ProblemParagraphManage, BatchSerializer, get_embedding_model_id_by_knowledge_id
from knowledge.serializers.paragraph import ParagraphSerializers, ParagraphInstanceSerializer, \ from knowledge.serializers.paragraph import ParagraphSerializers, ParagraphInstanceSerializer, \
delete_problems_and_mappings delete_problems_and_mappings
from knowledge.task import embedding_by_document, delete_embedding_by_document_list from knowledge.task.embedding import embedding_by_document, delete_embedding_by_document_list, \
delete_embedding_by_document
from maxkb.const import PROJECT_DIR from maxkb.const import PROJECT_DIR
default_split_handle = TextSplitHandle() default_split_handle = TextSplitHandle()
@ -62,13 +67,6 @@ class DocumentInstanceSerializer(serializers.Serializer):
paragraphs = ParagraphInstanceSerializer(required=False, many=True, allow_null=True) paragraphs = ParagraphInstanceSerializer(required=False, many=True, allow_null=True)
class DocumentCreateRequest(serializers.Serializer):
    """Request body for creating a knowledge base together with its documents."""
    # NOTE(review): the labels say 'knowledge name/description' although the
    # class is named DocumentCreateRequest — confirm the wording is intended.
    name = serializers.CharField(required=True, label=_('knowledge name'), max_length=64, min_length=1)
    desc = serializers.CharField(required=True, label=_('knowledge description'), max_length=256, min_length=1)
    embedding_model_id = serializers.UUIDField(required=True, label=_('embedding model'))
    # Optional initial documents to create alongside the knowledge base.
    documents = DocumentInstanceSerializer(required=False, many=True)
class DocumentSplitRequest(serializers.Serializer): class DocumentSplitRequest(serializers.Serializer):
file = serializers.ListField(required=True, label=_('file list')) file = serializers.ListField(required=True, label=_('file list'))
limit = serializers.IntegerField(required=False, label=_('limit')) limit = serializers.IntegerField(required=False, label=_('limit'))
@ -80,18 +78,153 @@ class DocumentSplitRequest(serializers.Serializer):
with_filter = serializers.BooleanField(required=False, label=_('Auto Clean')) with_filter = serializers.BooleanField(required=False, label=_('Auto Clean'))
class DocumentBatchRequest(serializers.Serializer):
    """Request body for batch-importing documents from uploaded files."""
    file = serializers.ListField(required=True, label=_('file list'))
    # Maximum paragraph size used when splitting the files.
    limit = serializers.IntegerField(required=False, label=_('limit'))
    # Regex patterns used as paragraph split boundaries.
    patterns = serializers.ListField(
        required=False,
        child=serializers.CharField(required=True, label=_('patterns')),
        label=_('patterns')
    )
    # Whether to auto-clean the split content.
    with_filter = serializers.BooleanField(required=False, label=_('Auto Clean'))
class DocumentSerializers(serializers.Serializer): class DocumentSerializers(serializers.Serializer):
class Query(serializers.Serializer):
    """Filtering and ordering parameters for listing a knowledge base's documents."""

    # Owning workspace / knowledge base.
    workspace_id = serializers.CharField(required=True, label=_('workspace id'))
    knowledge_id = serializers.UUIDField(required=True, label=_('knowledge id'))
    name = serializers.CharField(required=False, max_length=128, min_length=1, label=_('document name'))
    hit_handling_method = serializers.CharField(required=False, label=_('hit handling method'))
    is_active = serializers.BooleanField(required=False, label=_('document is active'))
    task_type = serializers.IntegerField(required=False, label=_('task type'))
    status = serializers.CharField(required=False, label=_('status'))
    order_by = serializers.CharField(required=False, label=_('order by'))

    def get_query_set(self):
        """Build the filtered document query and a separate ordering query.

        Returns a dict keyed for the native-SQL search helpers:
        ``document_custom_sql`` (filter) and ``order_by_query`` (ordering).
        """
        documents = QuerySet(model=Document).filter(
            knowledge_id=self.data.get('knowledge_id'))

        name = self.data.get('name')
        if name is not None:
            documents = documents.filter(name__icontains=name)

        hit_handling_method = self.data.get('hit_handling_method')
        if hit_handling_method is not None:
            documents = documents.filter(hit_handling_method=hit_handling_method)

        is_active = self.data.get('is_active')
        if is_active is not None:
            documents = documents.filter(is_active=is_active)

        status = self.data.get('status')
        if status is not None:
            task_type = self.data.get('task_type')
            if task_type is not None:
                # The per-task state is stored as one character of the reversed
                # status string, indexed by the task type value.
                documents = documents.annotate(
                    reversed_status=Reverse('status'),
                    task_type_status=Substr('reversed_status', TaskType(task_type).value, 1),
                ).filter(task_type_status=State(status).value).values('id')
            elif status != State.SUCCESS.value:
                documents = documents.filter(status__icontains=status)
            else:
                # Overall success: every position is SUCCESS ('2') or not applicable ('n').
                documents = documents.filter(status__iregex='^[2n]*$')

        # NOTE(review): update_time is declared IntegerField while create_time is
        # DateTimeField — confirm this matches the SQL view's column types.
        sortable = QuerySet(model=get_dynamics_model({
            'char_length': models.CharField(),
            'paragraph_count': models.IntegerField(),
            "update_time": models.IntegerField(),
            'create_time': models.DateTimeField(),
        }))
        order_by = self.data.get('order_by', '')
        sortable = (sortable.order_by(order_by) if order_by
                    else sortable.order_by('-create_time', 'id'))

        return {
            'document_custom_sql': documents,
            'order_by_query': sortable,
        }

    def list(self, with_valid=False):
        """Run the list query through the native SQL template."""
        if with_valid:
            self.is_valid(raise_exception=True)
        return native_search(
            self.get_query_set(),
            select_string=get_file_content(
                os.path.join(PROJECT_DIR, "apps", "knowledge", 'sql', 'list_document.sql')))

    def page(self, current_page, page_size):
        """Run the same query with pagination."""
        return native_page_search(
            current_page, page_size, self.get_query_set(),
            select_string=get_file_content(
                os.path.join(PROJECT_DIR, "apps", "knowledge", 'sql', 'list_document.sql')))
class Sync(serializers.Serializer):
    """Re-fetch a WEB-type document from its source URL and rebuild its
    paragraphs, problems and (optionally) its embeddings."""

    document_id = serializers.UUIDField(required=True, label=_('document id'))

    def is_valid(self, *, raise_exception=False):
        # Always raises on invalid input regardless of the flag value.
        super().is_valid(raise_exception=True)
        document_id = self.data.get('document_id')
        first = QuerySet(Document).filter(id=document_id).first()
        if first is None:
            raise AppApiException(500, _('document id not exist'))
        if first.type != KnowledgeType.WEB:
            raise AppApiException(500, _('Synchronization is only supported for web site types'))

    def sync(self, with_valid=True, with_embedding=True):
        """Synchronize the document.

        :param with_valid: validate the serializer (and document) first
        :param with_embedding: queue re-embedding after a successful fetch
        :return: True (also True as a no-op for non-WEB documents)
        """
        if with_valid:
            self.is_valid(raise_exception=True)
        document_id = self.data.get('document_id')
        document = QuerySet(Document).filter(id=document_id).first()
        state = State.SUCCESS
        if document.type != KnowledgeType.WEB:
            return True
        try:
            # Mark the SYNC task as pending and refresh the aggregated status.
            ListenerManagement.update_status(QuerySet(Document).filter(id=document_id),
                                             TaskType.SYNC,
                                             State.PENDING)
            ListenerManagement.get_aggregation_document_status(document_id)()
            source_url = document.meta.get('source_url')
            # The selector is stored as a space-separated string in meta.
            selector_list = document.meta.get('selector').split(
                " ") if 'selector' in document.meta and document.meta.get('selector') is not None else []
            result = Fork(source_url, selector_list).fork()
            if result.status == 200:
                # Delete existing paragraphs.
                QuerySet(model=Paragraph).filter(document_id=document_id).delete()
                # Delete existing problems and their mappings.
                QuerySet(model=ProblemParagraphMapping).filter(document_id=document_id).delete()
                delete_problems_and_mappings([document_id])
                # Delete existing vectors.
                delete_embedding_by_document(document_id)
                paragraphs = get_split_model('web.md').parse(result.content)
                char_length = reduce(lambda x, y: x + y,
                                     [len(p.get('content')) for p in paragraphs],
                                     0)
                QuerySet(Document).filter(id=document_id).update(char_length=char_length)
                document_paragraph_model = DocumentSerializers.Create.get_paragraph_model(document, paragraphs)
                paragraph_model_list = document_paragraph_model.get('paragraph_model_list')
                problem_paragraph_object_list = document_paragraph_model.get('problem_paragraph_object_list')
                problem_model_list, problem_paragraph_mapping_list = ProblemParagraphManage(
                    problem_paragraph_object_list, document.knowledge_id).to_problem_model_list()
                # Bulk-insert paragraphs.
                QuerySet(Paragraph).bulk_create(paragraph_model_list) if len(paragraph_model_list) > 0 else None
                # Bulk-insert problems.
                QuerySet(Problem).bulk_create(problem_model_list) if len(problem_model_list) > 0 else None
                # Insert problem-paragraph associations.
                QuerySet(ProblemParagraphMapping).bulk_create(problem_paragraph_mapping_list) if len(
                    problem_paragraph_mapping_list) > 0 else None
                # Queue embedding of the rebuilt content.
                if with_embedding:
                    embedding_model_id = get_embedding_model_id_by_knowledge_id(document.knowledge_id)
                    ListenerManagement.update_status(QuerySet(Document).filter(id=document_id),
                                                     TaskType.EMBEDDING,
                                                     State.PENDING)
                    ListenerManagement.update_status(QuerySet(Paragraph).filter(document_id=document_id),
                                                     TaskType.EMBEDDING,
                                                     State.PENDING)
                    ListenerManagement.get_aggregation_document_status(document_id)()
                    embedding_by_document.delay(document_id, embedding_model_id)
            else:
                state = State.FAILURE
        except Exception as e:
            logging.getLogger("max_kb_error").error(f'{str(e)}:{traceback.format_exc()}')
            state = State.FAILURE
        # Record the final SYNC state on both the document and its paragraphs.
        ListenerManagement.update_status(
            QuerySet(Document).filter(id=document_id),
            TaskType.SYNC,
            state
        )
        ListenerManagement.update_status(
            QuerySet(Paragraph).filter(document_id=document_id),
            TaskType.SYNC,
            state
        )
        ListenerManagement.get_aggregation_document_status(document_id)()
        return True
class Operate(serializers.Serializer): class Operate(serializers.Serializer):
document_id = serializers.UUIDField(required=True, label=_('document id')) document_id = serializers.UUIDField(required=True, label=_('document id'))
knowledge_id = serializers.UUIDField(required=True, label=_('knowledge id')) knowledge_id = serializers.UUIDField(required=True, label=_('knowledge id'))
@ -148,6 +281,7 @@ class DocumentSerializers(serializers.Serializer):
raise AppApiException(500, _('The task is being executed, please do not send it repeatedly.')) raise AppApiException(500, _('The task is being executed, please do not send it repeatedly.'))
class Create(serializers.Serializer): class Create(serializers.Serializer):
workspace_id = serializers.UUIDField(required=True, label=_('workspace id'))
knowledge_id = serializers.UUIDField(required=True, label=_('document id')) knowledge_id = serializers.UUIDField(required=True, label=_('document id'))
def is_valid(self, *, raise_exception=False): def is_valid(self, *, raise_exception=False):
@ -166,7 +300,7 @@ class DocumentSerializers(serializers.Serializer):
@transaction.atomic @transaction.atomic
def save(self, instance: Dict, with_valid=False, **kwargs): def save(self, instance: Dict, with_valid=False, **kwargs):
if with_valid: if with_valid:
DocumentCreateRequest(data=instance).is_valid(raise_exception=True) DocumentInstanceSerializer(data=instance).is_valid(raise_exception=True)
self.is_valid(raise_exception=True) self.is_valid(raise_exception=True)
knowledge_id = self.data.get('knowledge_id') knowledge_id = self.data.get('knowledge_id')
document_paragraph_model = self.get_document_paragraph_model(knowledge_id, instance) document_paragraph_model = self.get_document_paragraph_model(knowledge_id, instance)

View File

@ -16,7 +16,8 @@ from knowledge.models import Knowledge, KnowledgeScope, KnowledgeType, Document,
ProblemParagraphMapping, ApplicationKnowledgeMapping ProblemParagraphMapping, ApplicationKnowledgeMapping
from knowledge.serializers.common import ProblemParagraphManage, get_embedding_model_id_by_knowledge_id, MetaSerializer from knowledge.serializers.common import ProblemParagraphManage, get_embedding_model_id_by_knowledge_id, MetaSerializer
from knowledge.serializers.document import DocumentSerializers from knowledge.serializers.document import DocumentSerializers
from knowledge.task import sync_web_knowledge, embedding_by_knowledge, delete_embedding_by_knowledge from knowledge.task.embedding import embedding_by_knowledge, delete_embedding_by_knowledge
from knowledge.task.sync import sync_web_knowledge
from maxkb.conf import PROJECT_DIR from maxkb.conf import PROJECT_DIR

View File

@ -10,12 +10,13 @@ from rest_framework import serializers
from common.exception.app_exception import AppApiException from common.exception.app_exception import AppApiException
from common.utils.common import post from common.utils.common import post
from knowledge.models import Paragraph, Problem, Document, ProblemParagraphMapping from knowledge.models import Paragraph, Problem, Document, ProblemParagraphMapping, SourceType
from knowledge.serializers.common import ProblemParagraphObject, ProblemParagraphManage, \ from knowledge.serializers.common import ProblemParagraphObject, ProblemParagraphManage, \
get_embedding_model_id_by_knowledge_id, update_document_char_length get_embedding_model_id_by_knowledge_id, update_document_char_length
from knowledge.serializers.problem import ProblemInstanceSerializer, ProblemSerializer from knowledge.serializers.problem import ProblemInstanceSerializer, ProblemSerializer, ProblemSerializers
from knowledge.task import embedding_by_paragraph, enable_embedding_by_paragraph, disable_embedding_by_paragraph, \ from knowledge.task.embedding import embedding_by_paragraph, enable_embedding_by_paragraph, \
delete_embedding_by_paragraph disable_embedding_by_paragraph, \
delete_embedding_by_paragraph, embedding_by_problem as embedding_by_problem_task
class ParagraphSerializer(serializers.ModelSerializer): class ParagraphSerializer(serializers.ModelSerializer):
@ -49,6 +50,70 @@ class ParagraphSerializers(serializers.Serializer):
allow_blank=True) allow_blank=True)
content = serializers.CharField(required=True, max_length=102400, label=_('section title')) content = serializers.CharField(required=True, max_length=102400, label=_('section title'))
class Problem(serializers.Serializer):
    """Operations on the problems (questions) associated with one paragraph."""

    knowledge_id = serializers.UUIDField(required=True, label=_('knowledge id'))
    document_id = serializers.UUIDField(required=True, label=_('document id'))
    paragraph_id = serializers.UUIDField(required=True, label=_('paragraph id'))

    def is_valid(self, *, raise_exception=False):
        """Validate fields and ensure the paragraph exists; always raises on failure."""
        super().is_valid(raise_exception=True)
        if not QuerySet(Paragraph).filter(id=self.data.get('paragraph_id')).exists():
            raise AppApiException(500, _('Paragraph id does not exist'))

    def list(self, with_valid=False):
        """Return the serialized problems mapped to this paragraph.

        :param with_valid: whether to validate first
        :return: list of problem dicts
        """
        if with_valid:
            self.is_valid(raise_exception=True)
        problem_paragraph_mapping = QuerySet(ProblemParagraphMapping).filter(
            knowledge_id=self.data.get("knowledge_id"),
            paragraph_id=self.data.get(
                'paragraph_id'))
        return [ProblemSerializer(row).data for row in
                QuerySet(Problem).filter(id__in=[row.problem_id for row in problem_paragraph_mapping])]

    @transaction.atomic
    def save(self, instance: Dict, with_valid=True, with_embedding=True, embedding_by_problem=None):
        """Create (or reuse) a problem with the given content, associate it with
        this paragraph, and optionally embed it.

        :param instance: payload, e.g. {'content': '...'}
        :param with_valid: validate serializer and payload first
        :param with_embedding: embed the problem text after saving
        :param embedding_by_problem: unused; kept for interface compatibility
        :raises AppApiException: if the association already exists
        :return: the saved problem's serialized form
        """
        if with_valid:
            self.is_valid()
            ProblemInstanceSerializer(data=instance).is_valid(raise_exception=True)
        # Reuse an identical problem within the knowledge base if present.
        problem = QuerySet(Problem).filter(knowledge_id=self.data.get('knowledge_id'),
                                           content=instance.get('content')).first()
        if problem is None:
            problem = Problem(id=uuid.uuid7(), knowledge_id=self.data.get('knowledge_id'),
                              content=instance.get('content'))
            problem.save()
        if QuerySet(ProblemParagraphMapping).filter(knowledge_id=self.data.get('knowledge_id'),
                                                    problem_id=problem.id,
                                                    paragraph_id=self.data.get('paragraph_id')).exists():
            raise AppApiException(500, _('Already associated, please do not associate again'))
        problem_paragraph_mapping = ProblemParagraphMapping(
            id=uuid.uuid7(),
            problem_id=problem.id,
            document_id=self.data.get('document_id'),
            paragraph_id=self.data.get('paragraph_id'),
            knowledge_id=self.data.get('knowledge_id')
        )
        problem_paragraph_mapping.save()
        if with_embedding:
            # Look up the embedding model only when embedding is requested,
            # avoiding a pointless query when with_embedding=False.
            model_id = get_embedding_model_id_by_knowledge_id(self.data.get('knowledge_id'))
            embedding_by_problem_task({
                'text': problem.content,
                'is_active': True,
                'source_type': SourceType.PROBLEM,
                'source_id': problem_paragraph_mapping.id,
                'document_id': self.data.get('document_id'),
                'paragraph_id': self.data.get('paragraph_id'),
                'knowledge_id': self.data.get('knowledge_id'),
            }, model_id)
        return ProblemSerializers.Operate(
            data={'knowledge_id': self.data.get('knowledge_id'), 'problem_id': problem.id}
        ).one(with_valid=True)
class Operate(serializers.Serializer): class Operate(serializers.Serializer):
# 段落id # 段落id
paragraph_id = serializers.UUIDField(required=True, label=_('paragraph id')) paragraph_id = serializers.UUIDField(required=True, label=_('paragraph id'))

View File

@ -1,7 +1,17 @@
import os
from typing import Dict
from django.db import transaction
from django.db.models import QuerySet
from django.utils.translation import gettext_lazy as _ from django.utils.translation import gettext_lazy as _
from rest_framework import serializers from rest_framework import serializers
from knowledge.models import Problem from common.db.search import native_search
from common.utils.common import get_file_content
from knowledge.models import Problem, ProblemParagraphMapping, Paragraph, Knowledge
from knowledge.serializers.common import get_embedding_model_id_by_knowledge_id
from knowledge.task.embedding import delete_embedding_by_source_ids, update_problem_embedding
from maxkb.const import PROJECT_DIR
class ProblemSerializer(serializers.ModelSerializer): class ProblemSerializer(serializers.ModelSerializer):
@ -13,3 +23,55 @@ class ProblemSerializer(serializers.ModelSerializer):
class ProblemInstanceSerializer(serializers.Serializer): class ProblemInstanceSerializer(serializers.Serializer):
id = serializers.CharField(required=False, label=_('problem id')) id = serializers.CharField(required=False, label=_('problem id'))
content = serializers.CharField(required=True, max_length=256, label=_('content')) content = serializers.CharField(required=True, max_length=256, label=_('content'))
class ProblemSerializers(serializers.Serializer):
    """Serializer namespace for problem (question) operations."""

    class Operate(serializers.Serializer):
        """Operations on a single problem inside a knowledge base."""

        knowledge_id = serializers.UUIDField(required=True, label=_('knowledge id'))
        problem_id = serializers.UUIDField(required=True, label=_('problem id'))

        def list_paragraph(self, with_valid=True):
            """Return the paragraphs this problem is mapped to, via the native SQL template."""
            if with_valid:
                self.is_valid(raise_exception=True)
            problem_paragraph_mapping = QuerySet(ProblemParagraphMapping).filter(
                knowledge_id=self.data.get("knowledge_id"),
                problem_id=self.data.get("problem_id")
            )
            if problem_paragraph_mapping is None or len(problem_paragraph_mapping) == 0:
                return []
            return native_search(
                QuerySet(Paragraph).filter(id__in=[row.paragraph_id for row in problem_paragraph_mapping]),
                select_string=get_file_content(
                    os.path.join(PROJECT_DIR, "apps", "knowledge", 'sql', 'list_paragraph.sql')))

        def one(self, with_valid=True):
            """Return the serialized problem.

            :raises Problem.DoesNotExist: if the id is unknown (via QuerySet.get)
            """
            if with_valid:
                self.is_valid(raise_exception=True)
            return ProblemInstanceSerializer(QuerySet(Problem).get(**{'id': self.data.get('problem_id')})).data

        @transaction.atomic
        def delete(self, with_valid=True):
            """Delete the problem, its paragraph mappings, and their embeddings."""
            if with_valid:
                self.is_valid(raise_exception=True)
            problem_paragraph_mapping_list = QuerySet(ProblemParagraphMapping).filter(
                knowledge_id=self.data.get('knowledge_id'),
                problem_id=self.data.get('problem_id'))
            # Capture mapping ids before deleting so the vectors can be removed too.
            source_ids = [row.id for row in problem_paragraph_mapping_list]
            problem_paragraph_mapping_list.delete()
            QuerySet(Problem).filter(id=self.data.get('problem_id')).delete()
            delete_embedding_by_source_ids(source_ids)
            return True

        @transaction.atomic
        def edit(self, instance: Dict, with_valid=True):
            """Update the problem's content and refresh its embedding.

            :param instance: payload, e.g. {'content': '...'}
            :raises AppApiException: if the problem does not exist in this knowledge base
            """
            if with_valid:
                self.is_valid(raise_exception=True)
            problem_id = self.data.get('problem_id')
            knowledge_id = self.data.get('knowledge_id')
            content = instance.get('content')
            problem = QuerySet(Problem).filter(id=problem_id, knowledge_id=knowledge_id).first()
            # Guard against a missing problem: .first() returns None and the
            # original code would crash with AttributeError on problem.content.
            if problem is None:
                raise AppApiException(500, _('problem id not exist'))
            problem.content = content
            problem.save()
            model_id = get_embedding_model_id_by_knowledge_id(knowledge_id)
            update_problem_embedding(problem_id, content, model_id)
            return True

View File

@ -1,2 +0,0 @@
from .sync import *
from .embedding import *

View File

@ -11,24 +11,28 @@ from django.utils.translation import gettext_lazy as _
from common.utils.fork import ChildLink, Fork from common.utils.fork import ChildLink, Fork
from common.utils.split_model import get_split_model from common.utils.split_model import get_split_model
from knowledge.models.knowledge import KnowledgeType, Document, Knowledge, Status from knowledge.models.knowledge import KnowledgeType, Document, Knowledge, Status
from knowledge.serializers.document import DocumentSerializers
from knowledge.serializers.paragraph import ParagraphSerializers
max_kb_error = logging.getLogger("max_kb_error") max_kb_error = logging.getLogger("max_kb_error")
max_kb = logging.getLogger("max_kb") max_kb = logging.getLogger("max_kb")
def get_save_handler(knowledge_id, selector): def get_save_handler(knowledge_id, selector):
from knowledge.serializers import DocumentSerializers
def handler(child_link: ChildLink, response: Fork.Response): def handler(child_link: ChildLink, response: Fork.Response):
if response.status == 200: if response.status == 200:
try: try:
document_name = child_link.tag.text if child_link.tag is not None and len( document_name = child_link.tag.text if child_link.tag is not None and len(
child_link.tag.text.strip()) > 0 else child_link.url child_link.tag.text.strip()) > 0 else child_link.url
paragraphs = get_split_model('web.md').parse(response.content) paragraphs = get_split_model('web.md').parse(response.content)
DocumentSerializers.Create(data={'knowledge_id': knowledge_id}).save( DocumentSerializers.Create(
{'name': document_name, 'paragraphs': paragraphs, data={'knowledge_id': knowledge_id}
'meta': {'source_url': child_link.url, 'selector': selector}, ).save({
'type': KnowledgeType.WEB}, with_valid=True) 'name': document_name,
'paragraphs': paragraphs,
'meta': {'source_url': child_link.url, 'selector': selector},
'type': KnowledgeType.WEB
}, with_valid=True)
except Exception as e: except Exception as e:
logging.getLogger("max_kb_error").error(f'{str(e)}:{traceback.format_exc()}') logging.getLogger("max_kb_error").error(f'{str(e)}:{traceback.format_exc()}')
@ -36,7 +40,6 @@ def get_save_handler(knowledge_id, selector):
def get_sync_handler(knowledge_id): def get_sync_handler(knowledge_id):
from knowledge.serializers import DocumentSerializers
knowledge = QuerySet(Knowledge).filter(id=knowledge_id).first() knowledge = QuerySet(Knowledge).filter(id=knowledge_id).first()
def handler(child_link: ChildLink, response: Fork.Response): def handler(child_link: ChildLink, response: Fork.Response):
@ -52,10 +55,14 @@ def get_sync_handler(knowledge_id):
DocumentSerializers.Sync(data={'document_id': first.id}).sync() DocumentSerializers.Sync(data={'document_id': first.id}).sync()
else: else:
# 插入 # 插入
DocumentSerializers.Create(data={'knowledge_id': knowledge.id}).save( DocumentSerializers.Create(
{'name': document_name, 'paragraphs': paragraphs, data={'knowledge_id': knowledge.id}
'meta': {'source_url': child_link.url.strip(), 'selector': knowledge.meta.get('selector')}, ).save({
'type': KnowledgeType.WEB}, with_valid=True) 'name': document_name,
'paragraphs': paragraphs,
'meta': {'source_url': child_link.url.strip(), 'selector': knowledge.meta.get('selector')},
'type': KnowledgeType.WEB
}, with_valid=True)
except Exception as e: except Exception as e:
logging.getLogger("max_kb_error").error(f'{str(e)}:{traceback.format_exc()}') logging.getLogger("max_kb_error").error(f'{str(e)}:{traceback.format_exc()}')
@ -63,8 +70,6 @@ def get_sync_handler(knowledge_id):
def get_sync_web_document_handler(knowledge_id): def get_sync_web_document_handler(knowledge_id):
from knowledge.serializers import DocumentSerializers
def handler(source_url: str, selector, response: Fork.Response): def handler(source_url: str, selector, response: Fork.Response):
if response.status == 200: if response.status == 200:
try: try:
@ -88,7 +93,6 @@ def get_sync_web_document_handler(knowledge_id):
def save_problem(knowledge_id, document_id, paragraph_id, problem): def save_problem(knowledge_id, document_id, paragraph_id, problem):
from knowledge.serializers import ParagraphSerializers
# print(f"knowledge_id: {knowledge_id}") # print(f"knowledge_id: {knowledge_id}")
# print(f"document_id: {document_id}") # print(f"document_id: {document_id}")
# print(f"paragraph_id: {paragraph_id}") # print(f"paragraph_id: {paragraph_id}")
@ -101,7 +105,11 @@ def save_problem(knowledge_id, document_id, paragraph_id, problem):
return return
try: try:
ParagraphSerializers.Problem( ParagraphSerializers.Problem(
data={"knowledge_id": knowledge_id, 'document_id': document_id, data={
'paragraph_id': paragraph_id}).save(instance={"content": problem}, with_valid=True) "knowledge_id": knowledge_id,
'document_id': document_id,
'paragraph_id': paragraph_id
}
).save(instance={"content": problem}, with_valid=True)
except Exception as e: except Exception as e:
max_kb_error.error(_('Association problem failed {error}').format(error=str(e))) max_kb_error.error(_('Association problem failed {error}').format(error=str(e)))

View File

@ -8,7 +8,9 @@ urlpatterns = [
path('workspace/<str:workspace_id>/knowledge/base', views.KnowledgeBaseView.as_view()), path('workspace/<str:workspace_id>/knowledge/base', views.KnowledgeBaseView.as_view()),
path('workspace/<str:workspace_id>/knowledge/web', views.KnowledgeWebView.as_view()), path('workspace/<str:workspace_id>/knowledge/web', views.KnowledgeWebView.as_view()),
path('workspace/<str:workspace_id>/knowledge/<str:knowledge_id>', views.KnowledgeView.Operate.as_view()), path('workspace/<str:workspace_id>/knowledge/<str:knowledge_id>', views.KnowledgeView.Operate.as_view()),
path('workspace/<str:workspace_id>/knowledge/<str:knowledge_id>/document', views.DocumentView.as_view()),
path('workspace/<str:workspace_id>/knowledge/<str:knowledge_id>/document/split', views.DocumentView.Split.as_view()), path('workspace/<str:workspace_id>/knowledge/<str:knowledge_id>/document/split', views.DocumentView.Split.as_view()),
path('workspace/<str:workspace_id>/knowledge/<str:knowledge_id>/document/batch', views.DocumentView.Batch.as_view()), path('workspace/<str:workspace_id>/knowledge/<str:knowledge_id>/document/batch', views.DocumentView.Batch.as_view()),
path('workspace/<str:workspace_id>/knowledge/<str:knowledge_id>/document/<str:document_id>', views.DocumentView.Operate.as_view()),
path('workspace/<str:workspace_id>/knowledge/<int:current_page>/<int:page_size>', views.KnowledgeView.Page.as_view()), path('workspace/<str:workspace_id>/knowledge/<int:current_page>/<int:page_size>', views.KnowledgeView.Page.as_view()),
] ]

View File

@ -8,28 +8,45 @@ from common.auth import TokenAuth
from common.auth.authentication import has_permissions from common.auth.authentication import has_permissions
from common.constants.permission_constants import PermissionConstants from common.constants.permission_constants import PermissionConstants
from common.result import result from common.result import result
from knowledge.api.document import DocumentSplitAPI, DocumentBatchAPI, DocumentBatchCreateAPI from knowledge.api.document import DocumentSplitAPI, DocumentBatchAPI, DocumentBatchCreateAPI, DocumentCreateAPI, \
DocumentReadAPI, DocumentEditAPI, DocumentDeleteAPI
from knowledge.api.knowledge import KnowledgeTreeReadAPI from knowledge.api.knowledge import KnowledgeTreeReadAPI
from knowledge.serializers.document import DocumentSerializers from knowledge.serializers.document import DocumentSerializers
from knowledge.serializers.knowledge import KnowledgeSerializer
class DocumentView(APIView): class DocumentView(APIView):
authentication_classes = [TokenAuth] authentication_classes = [TokenAuth]
@extend_schema(
methods=['POST'],
description=_('Create document'),
operation_id=_('Create document'),
request=DocumentCreateAPI.get_request(),
parameters=DocumentCreateAPI.get_parameters(),
responses=DocumentCreateAPI.get_response(),
tags=[_('Knowledge Base/Documentation')]
)
@has_permissions(PermissionConstants.DOCUMENT_CREATE.get_workspace_permission())
def post(self, request: Request, workspace_id: str, knowledge_id: str):
return result.success(
DocumentSerializers.Create(
data={'workspace_id': workspace_id, 'knowledge_id': knowledge_id},
).save(request.data))
@extend_schema( @extend_schema(
methods=['GET'], methods=['GET'],
description=_('Get document'), description=_('Get document'),
operation_id=_('Get document'), operation_id=_('Get document'),
parameters=KnowledgeTreeReadAPI.get_parameters(), parameters=KnowledgeTreeReadAPI.get_parameters(),
responses=KnowledgeTreeReadAPI.get_response(), responses=KnowledgeTreeReadAPI.get_response(),
tags=[_('Knowledge Base')] tags=[_('Knowledge Base/Documentation')]
) )
@has_permissions(PermissionConstants.DOCUMENT_READ.get_workspace_permission()) @has_permissions(PermissionConstants.DOCUMENT_READ.get_workspace_permission())
def get(self, request: Request, workspace_id: str): def get(self, request: Request, workspace_id: str, knowledge_id: str):
return result.success(KnowledgeSerializer.Query( return result.success(DocumentSerializers.Query(
data={ data={
'workspace_id': workspace_id, 'workspace_id': workspace_id,
'knowledge_id': knowledge_id,
'folder_id': request.query_params.get('folder_id'), 'folder_id': request.query_params.get('folder_id'),
'name': request.query_params.get('name'), 'name': request.query_params.get('name'),
'desc': request.query_params.get("desc"), 'desc': request.query_params.get("desc"),
@ -37,6 +54,50 @@ class DocumentView(APIView):
} }
).list()) ).list())
class Operate(APIView):
    """Single-document endpoints: read, modify and delete."""

    authentication_classes = [TokenAuth]

    @extend_schema(
        description=_('Get document details'),
        operation_id=_('Get document details'),
        parameters=DocumentReadAPI.get_parameters(),
        responses=DocumentReadAPI.get_response(),
        tags=[_('Knowledge Base/Documentation')]
    )
    @has_permissions(PermissionConstants.DOCUMENT_READ.get_workspace_permission())
    def get(self, request: Request, knowledge_id: str, document_id: str):
        """Return one document's details."""
        serializer = DocumentSerializers.Operate(
            data={'document_id': document_id, 'knowledge_id': knowledge_id})
        serializer.is_valid(raise_exception=True)
        return result.success(serializer.one())

    @extend_schema(
        description=_('Modify document'),
        operation_id=_('Modify document'),
        parameters=DocumentEditAPI.get_parameters(),
        request=DocumentEditAPI.get_request(),
        responses=DocumentEditAPI.get_response(),
        tags=[_('Knowledge Base/Documentation')]
    )
    @has_permissions(PermissionConstants.DOCUMENT_EDIT.get_workspace_permission())
    def put(self, request: Request, knowledge_id: str, document_id: str):
        """Update one document from the request body."""
        serializer = DocumentSerializers.Operate(
            data={'document_id': document_id, 'knowledge_id': knowledge_id})
        # edit() validates internally when with_valid=True.
        return result.success(serializer.edit(request.data, with_valid=True))

    @extend_schema(
        description=_('Delete document'),
        operation_id=_('Delete document'),
        parameters=DocumentDeleteAPI.get_parameters(),
        responses=DocumentDeleteAPI.get_response(),
        tags=[_('Knowledge Base/Documentation')]
    )
    @has_permissions(PermissionConstants.DOCUMENT_DELETE.get_workspace_permission())
    def delete(self, request: Request, knowledge_id: str, document_id: str):
        """Delete one document."""
        serializer = DocumentSerializers.Operate(
            data={'document_id': document_id, 'knowledge_id': knowledge_id})
        serializer.is_valid(raise_exception=True)
        return result.success(serializer.delete())
class Split(APIView): class Split(APIView):
authentication_classes = [TokenAuth] authentication_classes = [TokenAuth]
parser_classes = [MultiPartParser] parser_classes = [MultiPartParser]