fix: 【知识库】导入非 UTF-8 编码的 txt 文件时,分段内容为空白
This commit is contained in:
parent
f540bbe18d
commit
d732a46f89
@ -357,7 +357,7 @@ class SplitModel:
|
|||||||
default_split_pattern = {
|
default_split_pattern = {
|
||||||
'md': [re.compile('(?<=^)# .*|(?<=\\n)# .*'), re.compile('(?<!#)## (?!#).*'), re.compile("(?<!#)### (?!#).*"),
|
'md': [re.compile('(?<=^)# .*|(?<=\\n)# .*'), re.compile('(?<!#)## (?!#).*'), re.compile("(?<!#)### (?!#).*"),
|
||||||
re.compile("(?<!#)#### (?!#).*"), re.compile("(?<!#)##### (?!#).*"),
|
re.compile("(?<!#)#### (?!#).*"), re.compile("(?<!#)##### (?!#).*"),
|
||||||
re.compile("(?<!#)###### (?!#).*")],
|
re.compile("(?<!#)###### (?!#).*"), re.compile("(?<!\n)\n\n+")],
|
||||||
'default': [re.compile("(?<!\n)\n\n+")]
|
'default': [re.compile("(?<!\n)\n\n+")]
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -374,7 +374,7 @@ def get_split_model(filename: str, with_filter: bool = False, limit: int = 4096)
|
|||||||
pattern_list = default_split_pattern.get('md')
|
pattern_list = default_split_pattern.get('md')
|
||||||
return SplitModel(pattern_list, with_filter=with_filter, limit=limit)
|
return SplitModel(pattern_list, with_filter=with_filter, limit=limit)
|
||||||
|
|
||||||
pattern_list = default_split_pattern.get('default')
|
pattern_list = default_split_pattern.get('md')
|
||||||
return SplitModel(pattern_list, with_filter=with_filter, limit=limit)
|
return SplitModel(pattern_list, with_filter=with_filter, limit=limit)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -32,6 +32,7 @@ from dataset.models.data_set import DataSet, Document, Paragraph, Problem, Type,
|
|||||||
from dataset.serializers.common_serializers import BatchSerializer, MetaSerializer
|
from dataset.serializers.common_serializers import BatchSerializer, MetaSerializer
|
||||||
from dataset.serializers.paragraph_serializers import ParagraphSerializers, ParagraphInstanceSerializer
|
from dataset.serializers.paragraph_serializers import ParagraphSerializers, ParagraphInstanceSerializer
|
||||||
from smartdoc.conf import PROJECT_DIR
|
from smartdoc.conf import PROJECT_DIR
|
||||||
|
import chardet
|
||||||
|
|
||||||
|
|
||||||
class DocumentEditInstanceSerializer(ApiMixin, serializers.Serializer):
|
class DocumentEditInstanceSerializer(ApiMixin, serializers.Serializer):
|
||||||
@ -599,7 +600,7 @@ def file_to_paragraph(file, pattern_list: List, with_filter: bool, limit: int):
|
|||||||
else:
|
else:
|
||||||
split_model = get_split_model(file.name, with_filter=with_filter, limit=limit)
|
split_model = get_split_model(file.name, with_filter=with_filter, limit=limit)
|
||||||
try:
|
try:
|
||||||
content = data.decode('utf-8')
|
content = data.decode(chardet.detect(data)['encoding'])
|
||||||
except BaseException as e:
|
except BaseException as e:
|
||||||
return {'name': file.name,
|
return {'name': file.name,
|
||||||
'content': []}
|
'content': []}
|
||||||
|
|||||||
@ -30,6 +30,7 @@ html2text = "^2024.2.26"
|
|||||||
langchain-openai = "^0.0.8"
|
langchain-openai = "^0.0.8"
|
||||||
django-ipware = "^6.0.4"
|
django-ipware = "^6.0.4"
|
||||||
django-apscheduler = "^0.6.2"
|
django-apscheduler = "^0.6.2"
|
||||||
|
chardet2 = "^2.0.3"
|
||||||
|
|
||||||
[build-system]
|
[build-system]
|
||||||
requires = ["poetry-core"]
|
requires = ["poetry-core"]
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user