refactor: reorganize file handling imports into a structured directory
This commit is contained in:
parent
2a5cd4ca14
commit
43bef216d5
@ -9,12 +9,12 @@
|
|||||||
import io
|
import io
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
import uuid_utils.compat as uuid
|
|
||||||
import zipfile
|
import zipfile
|
||||||
from typing import List
|
from typing import List
|
||||||
from urllib.parse import urljoin
|
from urllib.parse import urljoin
|
||||||
|
|
||||||
from django.db.models import QuerySet
|
import uuid_utils.compat as uuid
|
||||||
|
from django.utils.translation import gettext_lazy as _
|
||||||
|
|
||||||
from common.handle.base_parse_qa_handle import BaseParseQAHandle
|
from common.handle.base_parse_qa_handle import BaseParseQAHandle
|
||||||
from common.handle.impl.qa.csv_parse_qa_handle import CsvParseQAHandle
|
from common.handle.impl.qa.csv_parse_qa_handle import CsvParseQAHandle
|
||||||
@ -22,7 +22,6 @@ from common.handle.impl.qa.xls_parse_qa_handle import XlsParseQAHandle
|
|||||||
from common.handle.impl.qa.xlsx_parse_qa_handle import XlsxParseQAHandle
|
from common.handle.impl.qa.xlsx_parse_qa_handle import XlsxParseQAHandle
|
||||||
from common.utils.common import parse_md_image
|
from common.utils.common import parse_md_image
|
||||||
from knowledge.models import File
|
from knowledge.models import File
|
||||||
from django.utils.translation import gettext_lazy as _
|
|
||||||
|
|
||||||
|
|
||||||
class FileBufferHandle:
|
class FileBufferHandle:
|
||||||
|
|||||||
@ -15,7 +15,7 @@ class CsvSplitHandle(BaseParseTableHandle):
|
|||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def handle(self, file, get_buffer,save_image):
|
def handle(self, file, get_buffer, save_image):
|
||||||
buffer = get_buffer(file)
|
buffer = get_buffer(file)
|
||||||
try:
|
try:
|
||||||
content = buffer.decode(detect(buffer)['encoding'])
|
content = buffer.decode(detect(buffer)['encoding'])
|
||||||
@ -41,4 +41,4 @@ class CsvSplitHandle(BaseParseTableHandle):
|
|||||||
return buffer.decode(detect(buffer)['encoding'])
|
return buffer.decode(detect(buffer)['encoding'])
|
||||||
except BaseException as e:
|
except BaseException as e:
|
||||||
max_kb.error(f'csv split handle error: {e}')
|
max_kb.error(f'csv split handle error: {e}')
|
||||||
return f'error: {e}'
|
return f'error: {e}'
|
||||||
|
|||||||
@ -78,7 +78,6 @@ class XlsxSplitHandle(BaseParseTableHandle):
|
|||||||
return [{'name': file.name, 'paragraphs': []}]
|
return [{'name': file.name, 'paragraphs': []}]
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
def get_content(self, file, save_image):
|
def get_content(self, file, save_image):
|
||||||
try:
|
try:
|
||||||
# 加载 Excel 文件
|
# 加载 Excel 文件
|
||||||
|
|||||||
0
apps/common/handle/impl/text/__init__.py
Normal file
0
apps/common/handle/impl/text/__init__.py
Normal file
@ -10,10 +10,10 @@ import io
|
|||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
import traceback
|
import traceback
|
||||||
import uuid_utils.compat as uuid
|
|
||||||
from functools import reduce
|
from functools import reduce
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
|
import uuid_utils.compat as uuid
|
||||||
from docx import Document, ImagePart
|
from docx import Document, ImagePart
|
||||||
from docx.oxml import ns
|
from docx.oxml import ns
|
||||||
from docx.table import Table
|
from docx.table import Table
|
||||||
@ -22,7 +22,6 @@ from docx.text.paragraph import Paragraph
|
|||||||
from common.handle.base_split_handle import BaseSplitHandle
|
from common.handle.base_split_handle import BaseSplitHandle
|
||||||
from common.utils.split_model import SplitModel
|
from common.utils.split_model import SplitModel
|
||||||
from knowledge.models import File
|
from knowledge.models import File
|
||||||
from django.utils.translation import gettext_lazy as _
|
|
||||||
|
|
||||||
default_pattern_list = [re.compile('(?<=^)# .*|(?<=\\n)# .*'),
|
default_pattern_list = [re.compile('(?<=^)# .*|(?<=\\n)# .*'),
|
||||||
re.compile('(?<=\\n)(?<!#)## (?!#).*|(?<=^)(?<!#)## (?!#).*'),
|
re.compile('(?<=\\n)(?<!#)## (?!#).*|(?<=^)(?<!#)## (?!#).*'),
|
||||||
@ -70,4 +70,4 @@ class HTMLSplitHandle(BaseSplitHandle):
|
|||||||
return html2text(content)
|
return html2text(content)
|
||||||
except BaseException as e:
|
except BaseException as e:
|
||||||
traceback.print_exception(e)
|
traceback.print_exception(e)
|
||||||
return f'{e}'
|
return f'{e}'
|
||||||
@ -15,11 +15,11 @@ import traceback
|
|||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
import fitz
|
import fitz
|
||||||
|
from django.utils.translation import gettext_lazy as _
|
||||||
from langchain_community.document_loaders import PyPDFLoader
|
from langchain_community.document_loaders import PyPDFLoader
|
||||||
|
|
||||||
from common.handle.base_split_handle import BaseSplitHandle
|
from common.handle.base_split_handle import BaseSplitHandle
|
||||||
from common.utils.split_model import SplitModel
|
from common.utils.split_model import SplitModel
|
||||||
from django.utils.translation import gettext_lazy as _
|
|
||||||
|
|
||||||
default_pattern_list = [re.compile('(?<=^)# .*|(?<=\\n)# .*'),
|
default_pattern_list = [re.compile('(?<=^)# .*|(?<=\\n)# .*'),
|
||||||
re.compile('(?<=\\n)(?<!#)## (?!#).*|(?<=^)(?<!#)## (?!#).*'),
|
re.compile('(?<=\\n)(?<!#)## (?!#).*|(?<=^)(?<!#)## (?!#).*'),
|
||||||
@ -42,6 +42,7 @@ def check_links_in_pdf(doc):
|
|||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
class PdfSplitHandle(BaseSplitHandle):
|
class PdfSplitHandle(BaseSplitHandle):
|
||||||
def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
|
def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
|
||||||
with tempfile.NamedTemporaryFile(delete=False) as temp_file:
|
with tempfile.NamedTemporaryFile(delete=False) as temp_file:
|
||||||
@ -181,7 +182,8 @@ class PdfSplitHandle(BaseSplitHandle):
|
|||||||
for text in split_text:
|
for text in split_text:
|
||||||
chapters.append({"title": real_chapter_title, "content": text})
|
chapters.append({"title": real_chapter_title, "content": text})
|
||||||
else:
|
else:
|
||||||
chapters.append({"title": real_chapter_title, "content": chapter_text if chapter_text else real_chapter_title})
|
chapters.append(
|
||||||
|
{"title": real_chapter_title, "content": chapter_text if chapter_text else real_chapter_title})
|
||||||
# 保存章节内容和章节标题
|
# 保存章节内容和章节标题
|
||||||
return chapters
|
return chapters
|
||||||
|
|
||||||
@ -336,4 +338,4 @@ class PdfSplitHandle(BaseSplitHandle):
|
|||||||
return self.handle_pdf_content(file, pdf_document)
|
return self.handle_pdf_content(file, pdf_document)
|
||||||
except BaseException as e:
|
except BaseException as e:
|
||||||
traceback.print_exception(e)
|
traceback.print_exception(e)
|
||||||
return f'{e}'
|
return f'{e}'
|
||||||
@ -54,7 +54,7 @@ class TextSplitHandle(BaseSplitHandle):
|
|||||||
def get_content(self, file, save_image):
|
def get_content(self, file, save_image):
|
||||||
buffer = file.read()
|
buffer = file.read()
|
||||||
try:
|
try:
|
||||||
return buffer.decode(detect(buffer)['encoding'])
|
return buffer.decode(detect(buffer)['encoding'])
|
||||||
except BaseException as e:
|
except BaseException as e:
|
||||||
traceback.print_exception(e)
|
traceback.print_exception(e)
|
||||||
return f'{e}'
|
return f'{e}'
|
||||||
@ -18,13 +18,13 @@ from charset_normalizer import detect
|
|||||||
from django.utils.translation import gettext_lazy as _
|
from django.utils.translation import gettext_lazy as _
|
||||||
|
|
||||||
from common.handle.base_split_handle import BaseSplitHandle
|
from common.handle.base_split_handle import BaseSplitHandle
|
||||||
from common.handle.impl.csv_split_handle import CsvSplitHandle
|
from common.handle.impl.text.csv_split_handle import CsvSplitHandle
|
||||||
from common.handle.impl.doc_split_handle import DocSplitHandle
|
from common.handle.impl.text.doc_split_handle import DocSplitHandle
|
||||||
from common.handle.impl.html_split_handle import HTMLSplitHandle
|
from common.handle.impl.text.html_split_handle import HTMLSplitHandle
|
||||||
from common.handle.impl.pdf_split_handle import PdfSplitHandle
|
from common.handle.impl.text.pdf_split_handle import PdfSplitHandle
|
||||||
from common.handle.impl.text_split_handle import TextSplitHandle
|
from common.handle.impl.text.text_split_handle import TextSplitHandle
|
||||||
from common.handle.impl.xls_split_handle import XlsSplitHandle
|
from common.handle.impl.text.xls_split_handle import XlsSplitHandle
|
||||||
from common.handle.impl.xlsx_split_handle import XlsxSplitHandle
|
from common.handle.impl.text.xlsx_split_handle import XlsxSplitHandle
|
||||||
from common.utils.common import parse_md_image
|
from common.utils.common import parse_md_image
|
||||||
from knowledge.models import File
|
from knowledge.models import File
|
||||||
|
|
||||||
@ -13,14 +13,14 @@ from rest_framework import serializers
|
|||||||
from common.db.search import native_search
|
from common.db.search import native_search
|
||||||
from common.event import ListenerManagement
|
from common.event import ListenerManagement
|
||||||
from common.exception.app_exception import AppApiException
|
from common.exception.app_exception import AppApiException
|
||||||
from common.handle.impl.csv_split_handle import CsvSplitHandle
|
from common.handle.impl.text.csv_split_handle import CsvSplitHandle
|
||||||
from common.handle.impl.doc_split_handle import DocSplitHandle
|
from common.handle.impl.text.doc_split_handle import DocSplitHandle
|
||||||
from common.handle.impl.html_split_handle import HTMLSplitHandle
|
from common.handle.impl.text.html_split_handle import HTMLSplitHandle
|
||||||
from common.handle.impl.pdf_split_handle import PdfSplitHandle
|
from common.handle.impl.text.pdf_split_handle import PdfSplitHandle
|
||||||
from common.handle.impl.text_split_handle import TextSplitHandle
|
from common.handle.impl.text.text_split_handle import TextSplitHandle
|
||||||
from common.handle.impl.xls_split_handle import XlsSplitHandle
|
from common.handle.impl.text.xls_split_handle import XlsSplitHandle
|
||||||
from common.handle.impl.xlsx_split_handle import XlsxSplitHandle
|
from common.handle.impl.text.xlsx_split_handle import XlsxSplitHandle
|
||||||
from common.handle.impl.zip_split_handle import ZipSplitHandle
|
from common.handle.impl.text.zip_split_handle import ZipSplitHandle
|
||||||
from common.utils.common import post, get_file_content
|
from common.utils.common import post, get_file_content
|
||||||
from knowledge.models import Knowledge, Paragraph, Problem, Document, KnowledgeType, ProblemParagraphMapping, State, \
|
from knowledge.models import Knowledge, Paragraph, Problem, Document, KnowledgeType, ProblemParagraphMapping, State, \
|
||||||
TaskType, File
|
TaskType, File
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user