Pr@main@pdf (#23)

* feat: 分段API支持word,pdf * fix: 通用型知识库支持上传 PDF/DOC 格式的文档#19 --------- Co-authored-by: wangdan-fit2cloud <dan.wang@fit2cloud.com>
2024-03-29 18:28:05 +08:00 · 2024-03-29 18:28:05 +08:00 · c55bb3f6e5
commit c55bb3f6e5
parent da4b5be7a5
7 changed files with 186 additions and 14 deletions
--- a/apps/common/handle/base_split_handle.py
+++ b/apps/common/handle/base_split_handle.py
@ -0,0 +1,20 @@
 # coding=utf-8
 """
    @project: maxkb
    @Author：虎
    @file： base_split_handle.py
    @date：2024/3/27 18:13
    @desc:
 """
 from abc import ABC, abstractmethod
 from typing import List
 class BaseSplitHandle(ABC):
    """Interface for handlers that turn an uploaded file into split paragraphs."""

    @abstractmethod
    def support(self, file, get_buffer):
        """Report whether this handler can process ``file``.

        :param file: uploaded file object; implementations read ``file.name``
        :param get_buffer: callable taking ``file`` and returning its content
        :return: truthy when the handler accepts the file
        """

    @abstractmethod
    def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer):
        """Split ``file`` into paragraphs.

        :param file: uploaded file object
        :param pattern_list: split patterns supplied by the caller (may be empty or None)
        :param with_filter: forwarded to the split model by implementations
        :param limit: forwarded to the split model by implementations
        :param get_buffer: callable taking ``file`` and returning its content
        :return: implementations return a dict with ``name`` and ``content`` keys
        """
--- a/apps/common/handle/impl/doc_split_handle.py
+++ b/apps/common/handle/impl/doc_split_handle.py
@ -0,0 +1,45 @@
 # coding=utf-8
 """
    @project: maxkb
    @Author：虎
    @file： doc_split_handle.py
    @date：2024/3/27 18:19
    @desc:
 """
 import io
 import re
 from typing import List
 from docx import Document
 from common.handle.base_split_handle import BaseSplitHandle
 from common.util.split_model import SplitModel
 # Patterns used when the caller supplies no pattern_list. In list order:
 # Markdown headings level 1 through 6 ("# " ... "###### "), then a run of two
 # or more newlines. The level-1 pattern only matches at the start of the text
 # or right after a newline; for levels 2-6 the (?<!#) / (?!#) guards reject a
 # marker that is preceded by '#' or whose heading text starts with '#'.
 default_pattern_list = [re.compile('(?<=^)# .*|(?<=\\n)# .*'), re.compile('(?<!#)## (?!#).*'),
                        re.compile("(?<!#)### (?!#).*"),
                        re.compile("(?<!#)#### (?!#).*"), re.compile("(?<!#)##### (?!#).*"),
                        re.compile("(?<!#)###### (?!#).*"), re.compile("(?<!\n)\n\n+")]
 class DocSplitHandle(BaseSplitHandle):
    """Split handler for Word uploads (file names ending in ``.docx`` or ``.doc``)."""

    def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer):
        """Extract paragraph text from a Word document and split it.

        :param file: uploaded file object; ``file.name`` is echoed in the result
        :param pattern_list: caller-supplied split patterns; when empty or None
            the module-level ``default_pattern_list`` is used
        :param with_filter: forwarded to ``SplitModel``
        :param limit: forwarded to ``SplitModel``
        :param get_buffer: callable returning the raw bytes of ``file``
        :return: ``{'name': ..., 'content': ...}``; ``content`` is an empty list
            when the document cannot be read
        """
        import logging

        try:
            buffer = get_buffer(file)
            doc = Document(io.BytesIO(buffer))
            content = "\n".join(para.text for para in doc.paragraphs)
            patterns = pattern_list if pattern_list else default_pattern_list
            split_model = SplitModel(patterns, with_filter=with_filter, limit=limit)
        except Exception:
            # Best effort: an unreadable document yields no paragraphs instead of
            # failing the request, but the cause is logged rather than discarded.
            # Exception (not BaseException) lets KeyboardInterrupt/SystemExit through.
            logging.getLogger(__name__).exception("Failed to parse Word document %s", file.name)
            return {'name': file.name, 'content': []}
        return {'name': file.name, 'content': split_model.parse(content)}

    def support(self, file, get_buffer):
        """Return True when the file name ends with ``.docx`` or ``.doc`` (case-insensitive).

        NOTE(review): python-docx reads the Open XML ``.docx`` format only, so a
        legacy binary ``.doc`` upload is accepted here but is expected to fail in
        ``handle`` and come back with empty content. Confirm this is intended.
        """
        file_name: str = file.name.lower()
        return file_name.endswith((".docx", ".doc"))
--- a/apps/common/handle/impl/pdf_split_handle.py
+++ b/apps/common/handle/impl/pdf_split_handle.py
@ -0,0 +1,50 @@
 # coding=utf-8
 """
    @project: maxkb
    @Author：虎
    @file： pdf_split_handle.py
    @date：2024/3/27 18:19
    @desc:
 """
 import re
 from typing import List
 import fitz
 from common.handle.base_split_handle import BaseSplitHandle
 from common.util.split_model import SplitModel
 # Patterns used when the caller supplies no pattern_list. In list order:
 # Markdown headings level 1 through 6 ("# " ... "###### "), then a run of two
 # or more newlines. The level-1 pattern only matches at the start of the text
 # or right after a newline; for levels 2-6 the (?<!#) / (?!#) guards reject a
 # marker that is preceded by '#' or whose heading text starts with '#'.
 default_pattern_list = [re.compile('(?<=^)# .*|(?<=\\n)# .*'), re.compile('(?<!#)## (?!#).*'),
                        re.compile("(?<!#)### (?!#).*"),
                        re.compile("(?<!#)#### (?!#).*"), re.compile("(?<!#)##### (?!#).*"),
                        re.compile("(?<!#)###### (?!#).*"), re.compile("(?<!\n)\n\n+")]
 def number_to_text(pdf_document, page_number):
    """Return the text of the page at index ``page_number`` of ``pdf_document``.

    The page is obtained with ``pdf_document.load_page`` and its text with
    ``get_text()``.
    """
    return pdf_document.load_page(page_number).get_text()
 class PdfSplitHandle(BaseSplitHandle):
    """Split handler for PDF uploads (file names ending in ``.pdf``)."""

    def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer):
        """Extract the text of every page of a PDF and split it.

        :param file: uploaded file object; ``file.name`` is passed to ``fitz.open``
            and echoed in the result
        :param pattern_list: caller-supplied split patterns; when empty or None
            the module-level ``default_pattern_list`` is used
        :param with_filter: forwarded to ``SplitModel``
        :param limit: forwarded to ``SplitModel``
        :param get_buffer: callable returning the raw bytes of ``file``
        :return: ``{'name': ..., 'content': ...}``; ``content`` is an empty list
            when the PDF cannot be read
        """
        import logging

        try:
            buffer = get_buffer(file)
            # The context manager closes the document even when text extraction
            # raises; the original never closed it.
            with fitz.open(file.name, buffer) as pdf_document:
                content = "\n".join(
                    number_to_text(pdf_document, page_number) for page_number in range(len(pdf_document)))
            patterns = pattern_list if pattern_list else default_pattern_list
            split_model = SplitModel(patterns, with_filter=with_filter, limit=limit)
        except Exception:
            # Best effort: an unreadable PDF yields no paragraphs instead of
            # failing the request, but the cause is logged rather than discarded.
            # Exception (not BaseException) lets KeyboardInterrupt/SystemExit through.
            logging.getLogger(__name__).exception("Failed to parse PDF document %s", file.name)
            return {'name': file.name, 'content': []}
        return {'name': file.name, 'content': split_model.parse(content)}

    def support(self, file, get_buffer):
        """Return True when the file name ends with ``.pdf`` (case-insensitive)."""
        file_name: str = file.name.lower()
        return file_name.endswith(".pdf")
--- a/apps/common/handle/impl/text_split_handle.py
+++ b/apps/common/handle/impl/text_split_handle.py
@ -0,0 +1,47 @@
 # coding=utf-8
 """
    @project: maxkb
    @Author：虎
    @file： text_split_handle.py
    @date：2024/3/27 18:19
    @desc:
 """
 import re
 from typing import List
 import chardet
 from common.handle.base_split_handle import BaseSplitHandle
 from common.util.split_model import SplitModel
 # Patterns used when the caller supplies no pattern_list. In list order:
 # Markdown headings level 1 through 6 ("# " ... "###### "), then a run of two
 # or more newlines. The level-1 pattern only matches at the start of the text
 # or right after a newline; for levels 2-6 the (?<!#) / (?!#) guards reject a
 # marker that is preceded by '#' or whose heading text starts with '#'.
 default_pattern_list = [re.compile('(?<=^)# .*|(?<=\\n)# .*'), re.compile('(?<!#)## (?!#).*'),
                        re.compile("(?<!#)### (?!#).*"),
                        re.compile("(?<!#)#### (?!#).*"), re.compile("(?<!#)##### (?!#).*"),
                        re.compile("(?<!#)###### (?!#).*"), re.compile("(?<!\n)\n\n+")]
 class TextSplitHandle(BaseSplitHandle):
    """Split handler for plain-text uploads (``.md`` / ``.txt`` or detected text)."""

    def support(self, file, get_buffer):
        """Report whether ``file`` should be treated as text.

        ``.md`` and ``.txt`` names are accepted outright. Any other file is
        accepted when chardet detects a non-ASCII encoding with confidence
        above 0.5.

        :param file: uploaded file object
        :param get_buffer: callable returning the raw bytes of ``file``
        :return: True when this handler accepts the file, otherwise False
        """
        file_name: str = file.name.lower()
        # Check the cheap extension test first so the file content is only
        # read when encoding detection is actually needed.
        if file_name.endswith((".md", ".txt")):
            return True
        result = chardet.detect(get_buffer(file))
        return result['encoding'] != 'ascii' and result['confidence'] > 0.5

    def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer):
        """Decode ``file`` with the chardet-detected encoding and split it.

        :param file: uploaded file object; ``file.name`` is echoed in the result
        :param pattern_list: caller-supplied split patterns; when empty or None
            the module-level ``default_pattern_list`` is used
        :param with_filter: forwarded to ``SplitModel``
        :param limit: forwarded to ``SplitModel``
        :param get_buffer: callable returning the raw bytes of ``file``
        :return: ``{'name': ..., 'content': ...}``; ``content`` is an empty list
            when the bytes cannot be decoded
        """
        import logging

        buffer = get_buffer(file)
        patterns = pattern_list if pattern_list else default_pattern_list
        split_model = SplitModel(patterns, with_filter=with_filter, limit=limit)
        try:
            content = buffer.decode(chardet.detect(buffer)['encoding'])
        except (TypeError, LookupError, UnicodeError):
            # TypeError: no encoding detected (None); LookupError: codec unknown
            # to Python; UnicodeError: bytes invalid for the detected encoding.
            # Best effort: report no paragraphs, but log the cause.
            logging.getLogger(__name__).exception("Failed to decode text file %s", file.name)
            return {'name': file.name, 'content': []}
        return {'name': file.name, 'content': split_model.parse(content)}
--- a/apps/dataset/serializers/document_serializers.py
+++ b/apps/dataset/serializers/document_serializers.py
@ -22,6 +22,9 @@ from common.db.search import native_search, native_page_search
 from common.event.common import work_thread_pool
 from common.event.listener_manage import ListenerManagement, SyncWebDocumentArgs
 from common.exception.app_exception import AppApiException
 from common.handle.impl.doc_split_handle import DocSplitHandle
 from common.handle.impl.pdf_split_handle import PdfSplitHandle
 from common.handle.impl.text_split_handle import TextSplitHandle
 from common.mixins.api_mixin import ApiMixin
 from common.util.common import post
 from common.util.field_message import ErrMessage
@ -593,17 +596,22 @@ class DocumentSerializers(ApiMixin, serializers.Serializer):
            return True
 class FileBufferHandle:
    """Caches the result of the first ``file.read()`` made through it."""

    # Stays None until get_buffer has read a file.
    buffer = None

    def get_buffer(self, file):
        """Return the cached content, reading ``file`` only on the first call."""
        cached = self.buffer
        if cached is not None:
            return cached
        self.buffer = file.read()
        return self.buffer
 # Handler used by file_to_paragraph when no entry of split_handles accepts the file.
 default_split_handle = TextSplitHandle()
 # Handlers are tried in list order by file_to_paragraph; the first whose
 # support() accepts the file handles it.
 split_handles = [DocSplitHandle(), PdfSplitHandle(), default_split_handle]
 # Split an uploaded file into paragraphs. The added ("+") lines hand the file
 # to the first handler in split_handles whose support() accepts it and fall
 # back to default_split_handle otherwise. A single FileBufferHandle is created
 # per call and its get_buffer is passed to every handler.
 def file_to_paragraph(file, pattern_list: List, with_filter: bool, limit: int):
-    data = file.read()
+    get_buffer = FileBufferHandle().get_buffer
-    if pattern_list is not None and len(pattern_list) > 0:
+    for split_handle in split_handles:
-        split_model = SplitModel(pattern_list, with_filter, limit)
+        if split_handle.support(file, get_buffer):
-    else:
+            return split_handle.handle(file, pattern_list, with_filter, limit, get_buffer)
-        split_model = get_split_model(file.name, with_filter=with_filter, limit=limit)
+    return default_split_handle.handle(file, pattern_list, with_filter, limit, get_buffer)
    # NOTE(review): the lines below follow an unconditional return and use
    # `data` / `split_model`, which only the removed ("-") lines define.
    # Presumably they are leftovers of the old implementation; confirm and drop.
    try:
        content = data.decode(chardet.detect(data)['encoding'])
    except BaseException as e:
        return {'name': file.name,
                'content': []}
    return {'name': file.name,
            'content': split_model.parse(content)
            }
--- a/pyproject.toml
+++ b/pyproject.toml
@ -31,6 +31,8 @@ langchain-openai = "^0.0.8"
 django-ipware = "^6.0.4"
 django-apscheduler = "^0.6.2"
 chardet2 = "^2.0.3"
 pymupdf = "^1.24.0"
 python-docx = "^1.1.0"
 [build-system]
 requires = ["poetry-core"]
--- a/ui/src/views/dataset/component/UploadComponent.vue
+++ b/ui/src/views/dataset/component/UploadComponent.vue
@ -16,7 +16,7 @@
        action="#"
        :auto-upload="false"
        :show-file-list="false"
-        accept=".txt, .md, .csv, .log"
+        accept=".txt, .md, .csv, .log, .doc, .docx, .pdf"
        :limit="50"
        :on-exceed="onExceed"
      >