Pr@main@pdf (#23)
* feat: 分段API支持word,pdf * fix: 通用型知识库支持上传 PDF/DOC 格式的文档#19 --------- Co-authored-by: wangdan-fit2cloud <dan.wang@fit2cloud.com>
This commit is contained in:
parent
da4b5be7a5
commit
c55bb3f6e5
20
apps/common/handle/base_split_handle.py
Normal file
20
apps/common/handle/base_split_handle.py
Normal file
@ -0,0 +1,20 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
"""
|
||||||
|
@project: maxkb
|
||||||
|
@Author:虎
|
||||||
|
@file: base_split_handle.py
|
||||||
|
@date:2024/3/27 18:13
|
||||||
|
@desc:
|
||||||
|
"""
|
||||||
|
from abc import ABC, abstractmethod
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
|
||||||
|
class BaseSplitHandle(ABC):
    """Interface for handlers that turn an uploaded file into split paragraphs.

    A concrete handler reports through ``support`` whether it accepts a file
    and performs the actual work in ``handle``.
    """

    @abstractmethod
    def support(self, file, get_buffer):
        """Tell whether this handler accepts ``file``.

        :param file: the uploaded file object
        :param get_buffer: callable that takes ``file`` and returns its content
        :return: truthy when the handler can process the file
        """

    @abstractmethod
    def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer):
        """Split ``file`` into paragraphs.

        :param file: the uploaded file object
        :param pattern_list: split patterns supplied by the caller
        :param with_filter: option passed through to the splitter
        :param limit: option passed through to the splitter
        :param get_buffer: callable that takes ``file`` and returns its content
        :return: the split result produced by the concrete handler
        """
|
||||||
45
apps/common/handle/impl/doc_split_handle.py
Normal file
45
apps/common/handle/impl/doc_split_handle.py
Normal file
@ -0,0 +1,45 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
"""
|
||||||
|
@project: maxkb
|
||||||
|
@Author:虎
|
||||||
|
@file: doc_split_handle.py
|
||||||
|
@date:2024/3/27 18:19
|
||||||
|
@desc:
|
||||||
|
"""
|
||||||
|
import io
|
||||||
|
import re
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
from docx import Document
|
||||||
|
|
||||||
|
from common.handle.base_split_handle import BaseSplitHandle
|
||||||
|
from common.util.split_model import SplitModel
|
||||||
|
|
||||||
|
# Fallback split patterns: markdown headings of level 1 through 6 ("# " ... "###### "),
# followed by any run of two or more consecutive newlines.
default_pattern_list = [
    re.compile('(?<=^)# .*|(?<=\\n)# .*'),
    re.compile('(?<!#)## (?!#).*'),
    re.compile("(?<!#)### (?!#).*"),
    re.compile("(?<!#)#### (?!#).*"),
    re.compile("(?<!#)##### (?!#).*"),
    re.compile("(?<!#)###### (?!#).*"),
    re.compile("(?<!\n)\n\n+"),
]
|
||||||
|
|
||||||
|
|
||||||
|
class DocSplitHandle(BaseSplitHandle):
    """Split handler for Word documents, read through python-docx."""

    def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer):
        """Extract the paragraph text of a Word document and split it.

        :param file: uploaded file object; ``file.name`` is echoed in the result
        :param pattern_list: split patterns; when empty or ``None`` the module
            level ``default_pattern_list`` is used
        :param with_filter: passed through to ``SplitModel``
        :param limit: passed through to ``SplitModel``
        :param get_buffer: callable returning the raw bytes of ``file``
        :return: ``{'name': ..., 'content': ...}``; ``content`` is an empty list
            when the document cannot be read
        """
        try:
            buffer = get_buffer(file)
            doc = Document(io.BytesIO(buffer))
            # Only body paragraphs are collected, joined with single newlines.
            content = "\n".join(para.text for para in doc.paragraphs)
            if pattern_list:
                split_model = SplitModel(pattern_list, with_filter, limit)
            else:
                split_model = SplitModel(default_pattern_list, with_filter=with_filter, limit=limit)
        except Exception:
            # Best effort: an unreadable document yields no paragraphs. Catching
            # Exception (not BaseException) lets KeyboardInterrupt/SystemExit propagate.
            return {'name': file.name,
                    'content': []}
        return {'name': file.name,
                'content': split_model.parse(content)}

    def support(self, file, get_buffer):
        """Accept files whose name ends in ``.docx`` or ``.doc`` (case-insensitive).

        ``get_buffer`` is part of the handler interface but is not used here.
        """
        # NOTE(review): python-docx targets .docx; a legacy binary .doc presumably
        # fails to open and ends up as empty content in handle() - confirm.
        file_name: str = file.name.lower()
        return file_name.endswith((".docx", ".doc"))
|
||||||
50
apps/common/handle/impl/pdf_split_handle.py
Normal file
50
apps/common/handle/impl/pdf_split_handle.py
Normal file
@ -0,0 +1,50 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
"""
|
||||||
|
@project: maxkb
|
||||||
|
@Author:虎
|
||||||
|
@file: pdf_split_handle.py
|
||||||
|
@date:2024/3/27 18:19
|
||||||
|
@desc:
|
||||||
|
"""
|
||||||
|
import re
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
import fitz
|
||||||
|
|
||||||
|
from common.handle.base_split_handle import BaseSplitHandle
|
||||||
|
from common.util.split_model import SplitModel
|
||||||
|
|
||||||
|
# Fallback split patterns: markdown headings of level 1 through 6 ("# " ... "###### "),
# followed by any run of two or more consecutive newlines.
default_pattern_list = [
    re.compile('(?<=^)# .*|(?<=\\n)# .*'),
    re.compile('(?<!#)## (?!#).*'),
    re.compile("(?<!#)### (?!#).*"),
    re.compile("(?<!#)#### (?!#).*"),
    re.compile("(?<!#)##### (?!#).*"),
    re.compile("(?<!#)###### (?!#).*"),
    re.compile("(?<!\n)\n\n+"),
]
|
||||||
|
|
||||||
|
|
||||||
|
def number_to_text(pdf_document, page_number):
    """Return the text of a single page.

    Loads the page at index ``page_number`` from ``pdf_document`` with
    ``load_page`` and returns the result of its ``get_text()`` call.
    """
    return pdf_document.load_page(page_number).get_text()
|
||||||
|
|
||||||
|
|
||||||
|
class PdfSplitHandle(BaseSplitHandle):
    """Split handler for PDF files, read through PyMuPDF (``fitz``)."""

    def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer):
        """Extract the text of every page of a PDF and split it.

        :param file: uploaded file object; ``file.name`` is echoed in the result
            and handed to PyMuPDF together with the in-memory content
        :param pattern_list: split patterns; when empty or ``None`` the module
            level ``default_pattern_list`` is used
        :param with_filter: passed through to ``SplitModel``
        :param limit: passed through to ``SplitModel``
        :param get_buffer: callable returning the raw bytes of ``file``
        :return: ``{'name': ..., 'content': ...}``; ``content`` is an empty list
            when the PDF cannot be read
        """
        try:
            buffer = get_buffer(file)
            # The context manager closes the document once the text has been
            # extracted; previously it was left open.
            with fitz.open(filename=file.name, stream=buffer) as pdf_document:
                content = "\n".join(
                    number_to_text(pdf_document, page_number)
                    for page_number in range(len(pdf_document))
                )
            if pattern_list:
                split_model = SplitModel(pattern_list, with_filter, limit)
            else:
                split_model = SplitModel(default_pattern_list, with_filter=with_filter, limit=limit)
        except Exception:
            # Best effort: an unreadable PDF yields no paragraphs. Catching
            # Exception (not BaseException) lets KeyboardInterrupt/SystemExit propagate.
            return {'name': file.name,
                    'content': []}
        return {'name': file.name,
                'content': split_model.parse(content)}

    def support(self, file, get_buffer):
        """Accept files whose name ends in ``.pdf`` (case-insensitive).

        ``get_buffer`` is part of the handler interface but is not used here.
        """
        file_name: str = file.name.lower()
        return file_name.endswith(".pdf")
|
||||||
47
apps/common/handle/impl/text_split_handle.py
Normal file
47
apps/common/handle/impl/text_split_handle.py
Normal file
@ -0,0 +1,47 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
"""
|
||||||
|
@project: maxkb
|
||||||
|
@Author:虎
|
||||||
|
@file: text_split_handle.py
|
||||||
|
@date:2024/3/27 18:19
|
||||||
|
@desc:
|
||||||
|
"""
|
||||||
|
import re
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
import chardet
|
||||||
|
|
||||||
|
from common.handle.base_split_handle import BaseSplitHandle
|
||||||
|
from common.util.split_model import SplitModel
|
||||||
|
|
||||||
|
# Fallback split patterns: markdown headings of level 1 through 6 ("# " ... "###### "),
# followed by any run of two or more consecutive newlines.
default_pattern_list = [
    re.compile('(?<=^)# .*|(?<=\\n)# .*'),
    re.compile('(?<!#)## (?!#).*'),
    re.compile("(?<!#)### (?!#).*"),
    re.compile("(?<!#)#### (?!#).*"),
    re.compile("(?<!#)##### (?!#).*"),
    re.compile("(?<!#)###### (?!#).*"),
    re.compile("(?<!\n)\n\n+"),
]
|
||||||
|
|
||||||
|
|
||||||
|
class TextSplitHandle(BaseSplitHandle):
    """Split handler for plain-text content, decoded with a chardet-detected encoding."""

    def support(self, file, get_buffer):
        """Tell whether ``file`` should be treated as text.

        Names ending in ``.md`` or ``.txt`` (case-insensitive) are accepted
        outright. Otherwise the content is sniffed with chardet and accepted
        when the detected encoding is not ``'ascii'`` and the confidence is
        above 0.5.

        :param file: uploaded file object; only ``file.name`` is read directly
        :param get_buffer: callable returning the raw bytes of ``file``
        """
        file_name: str = file.name.lower()
        if file_name.endswith((".md", ".txt")):
            return True
        # Read the content only when the extension alone is inconclusive.
        result = chardet.detect(get_buffer(file))
        return result['encoding'] != 'ascii' and result['confidence'] > 0.5

    def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer):
        """Decode the file content and split it.

        :param file: uploaded file object; ``file.name`` is echoed in the result
        :param pattern_list: split patterns; when empty or ``None`` the module
            level ``default_pattern_list`` is used
        :param with_filter: passed through to ``SplitModel``
        :param limit: passed through to ``SplitModel``
        :param get_buffer: callable returning the raw bytes of ``file``
        :return: ``{'name': ..., 'content': ...}``; ``content`` is an empty list
            when the content cannot be decoded
        """
        buffer = get_buffer(file)
        if pattern_list:
            split_model = SplitModel(pattern_list, with_filter, limit)
        else:
            split_model = SplitModel(default_pattern_list, with_filter=with_filter, limit=limit)
        try:
            encoding = chardet.detect(buffer)['encoding']
            if encoding is None:
                # No encoding detected: report no paragraphs explicitly
                # instead of relying on decode(None) raising.
                return {'name': file.name,
                        'content': []}
            content = buffer.decode(encoding)
        except Exception:
            # Best effort: undecodable content yields no paragraphs. Catching
            # Exception (not BaseException) lets KeyboardInterrupt/SystemExit propagate.
            return {'name': file.name,
                    'content': []}
        return {'name': file.name,
                'content': split_model.parse(content)}
|
||||||
@ -22,6 +22,9 @@ from common.db.search import native_search, native_page_search
|
|||||||
from common.event.common import work_thread_pool
|
from common.event.common import work_thread_pool
|
||||||
from common.event.listener_manage import ListenerManagement, SyncWebDocumentArgs
|
from common.event.listener_manage import ListenerManagement, SyncWebDocumentArgs
|
||||||
from common.exception.app_exception import AppApiException
|
from common.exception.app_exception import AppApiException
|
||||||
|
from common.handle.impl.doc_split_handle import DocSplitHandle
|
||||||
|
from common.handle.impl.pdf_split_handle import PdfSplitHandle
|
||||||
|
from common.handle.impl.text_split_handle import TextSplitHandle
|
||||||
from common.mixins.api_mixin import ApiMixin
|
from common.mixins.api_mixin import ApiMixin
|
||||||
from common.util.common import post
|
from common.util.common import post
|
||||||
from common.util.field_message import ErrMessage
|
from common.util.field_message import ErrMessage
|
||||||
@ -593,17 +596,22 @@ class DocumentSerializers(ApiMixin, serializers.Serializer):
|
|||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
||||||
|
class FileBufferHandle:
    """Reads a file's content once and hands back the same bytes on later calls."""

    # Cached content; stays None until the first successful read.
    buffer = None

    def get_buffer(self, file):
        """Return the cached content, reading ``file`` first if nothing is cached.

        Once a value is cached it is returned as-is; ``file`` is not consulted
        again on later calls.
        """
        cached = self.buffer
        if cached is not None:
            return cached
        self.buffer = file.read()
        return self.buffer
|
||||||
|
|
||||||
|
|
||||||
|
# Handler used when no entry of split_handles accepts the file.
default_split_handle = TextSplitHandle()
# Handlers in the order they are tried; the text handler comes last.
split_handles = [
    DocSplitHandle(),
    PdfSplitHandle(),
    default_split_handle,
]
|
||||||
|
|
||||||
|
|
||||||
def file_to_paragraph(file, pattern_list: List, with_filter: bool, limit: int):
|
def file_to_paragraph(file, pattern_list: List, with_filter: bool, limit: int):
|
||||||
data = file.read()
|
get_buffer = FileBufferHandle().get_buffer
|
||||||
if pattern_list is not None and len(pattern_list) > 0:
|
for split_handle in split_handles:
|
||||||
split_model = SplitModel(pattern_list, with_filter, limit)
|
if split_handle.support(file, get_buffer):
|
||||||
else:
|
return split_handle.handle(file, pattern_list, with_filter, limit, get_buffer)
|
||||||
split_model = get_split_model(file.name, with_filter=with_filter, limit=limit)
|
return default_split_handle.handle(file, pattern_list, with_filter, limit, get_buffer)
|
||||||
try:
|
|
||||||
content = data.decode(chardet.detect(data)['encoding'])
|
|
||||||
except BaseException as e:
|
|
||||||
return {'name': file.name,
|
|
||||||
'content': []}
|
|
||||||
return {'name': file.name,
|
|
||||||
'content': split_model.parse(content)
|
|
||||||
}
|
|
||||||
|
|||||||
@ -31,6 +31,8 @@ langchain-openai = "^0.0.8"
|
|||||||
django-ipware = "^6.0.4"
|
django-ipware = "^6.0.4"
|
||||||
django-apscheduler = "^0.6.2"
|
django-apscheduler = "^0.6.2"
|
||||||
chardet2 = "^2.0.3"
|
chardet2 = "^2.0.3"
|
||||||
|
pymupdf = "^1.24.0"
|
||||||
|
python-docx = "^1.1.0"
|
||||||
|
|
||||||
[build-system]
|
[build-system]
|
||||||
requires = ["poetry-core"]
|
requires = ["poetry-core"]
|
||||||
|
|||||||
@ -16,7 +16,7 @@
|
|||||||
action="#"
|
action="#"
|
||||||
:auto-upload="false"
|
:auto-upload="false"
|
||||||
:show-file-list="false"
|
:show-file-list="false"
|
||||||
accept=".txt, .md, .csv, .log"
|
accept=".txt, .md, .csv, .log, .doc, .docx, .pdf"
|
||||||
:limit="50"
|
:limit="50"
|
||||||
:on-exceed="onExceed"
|
:on-exceed="onExceed"
|
||||||
>
|
>
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user