feat: refine regex patterns in text_split_handle for improved comment detection
--bug=1057526 --user=刘瑞斌 【知识库】markdown文件导入知识库,分段详情中代码块展示异常 https://www.tapd.cn/62980211/s/1719131
This commit is contained in:
parent
56fe631ed6
commit
e24a2001c5
@@ -15,12 +15,14 @@ from charset_normalizer import detect
|
|||||||
from common.handle.base_split_handle import BaseSplitHandle
|
from common.handle.base_split_handle import BaseSplitHandle
|
||||||
from common.utils.split_model import SplitModel
|
from common.utils.split_model import SplitModel
|
||||||
|
|
||||||
default_pattern_list = [re.compile('(?<=^)# .*|(?<=\\n)# .*'),
|
default_pattern_list = [
|
||||||
|
re.compile('(?<=^)# (?!-\\*- coding:).*|(?<=\\n)# (?!-\\*- coding:).*'),
|
||||||
re.compile('(?<=\\n)(?<!#)## (?!#).*|(?<=^)(?<!#)## (?!#).*'),
|
re.compile('(?<=\\n)(?<!#)## (?!#).*|(?<=^)(?<!#)## (?!#).*'),
|
||||||
re.compile("(?<=\\n)(?<!#)### (?!#).*|(?<=^)(?<!#)### (?!#).*"),
|
re.compile("(?<=\\n)(?<!#)### (?!#).*|(?<=^)(?<!#)### (?!#).*"),
|
||||||
re.compile("(?<=\\n)(?<!#)#### (?!#).*|(?<=^)(?<!#)#### (?!#).*"),
|
re.compile("(?<=\\n)(?<!#)#### (?!#).*|(?<=^)(?<!#)#### (?!#).*"),
|
||||||
re.compile("(?<=\\n)(?<!#)##### (?!#).*|(?<=^)(?<!#)##### (?!#).*"),
|
re.compile("(?<=\\n)(?<!#)##### (?!#).*|(?<=^)(?<!#)##### (?!#).*"),
|
||||||
re.compile("(?<=\\n)(?<!#)###### (?!#).*|(?<=^)(?<!#)###### (?!#).*")]
|
re.compile("(?<=\\n)(?<!#)###### (?!#).*|(?<=^)(?<!#)###### (?!#).*")
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
class TextSplitHandle(BaseSplitHandle):
|
class TextSplitHandle(BaseSplitHandle):
|
||||||
@@ -45,11 +47,8 @@ class TextSplitHandle(BaseSplitHandle):
|
|||||||
try:
|
try:
|
||||||
content = buffer.decode(detect(buffer)['encoding'])
|
content = buffer.decode(detect(buffer)['encoding'])
|
||||||
except BaseException as e:
|
except BaseException as e:
|
||||||
return {'name': file.name,
|
return {'name': file.name, 'content': []}
|
||||||
'content': []}
|
return {'name': file.name, 'content': split_model.parse(content)}
|
||||||
return {'name': file.name,
|
|
||||||
'content': split_model.parse(content)
|
|
||||||
}
|
|
||||||
|
|
||||||
def get_content(self, file, save_image):
|
def get_content(self, file, save_image):
|
||||||
buffer = file.read()
|
buffer = file.read()
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user