fix: 处理文本前后的空白字符
This commit is contained in:
parent
69a0ce74b6
commit
e16e827028
@@ -236,6 +236,19 @@ class PdfSplitHandle(BaseSplitHandle):
|
|||||||
|
|
||||||
# 目录中没有前言部分,手动处理
|
# 目录中没有前言部分,手动处理
|
||||||
if handle_pre_toc:
|
if handle_pre_toc:
|
||||||
|
pre_toc = []
|
||||||
|
lines = page_content.strip().split('\n')
|
||||||
|
try:
|
||||||
|
for line in lines:
|
||||||
|
if re.match(r'^前\s*言', line):
|
||||||
|
pre_toc.append({'title': line, 'content': ''})
|
||||||
|
else:
|
||||||
|
pre_toc[-1]['content'] += line
|
||||||
|
for i in range(len(pre_toc)):
|
||||||
|
pre_toc[i]['content'] = re.sub(r'(?<!。)\n+', '', pre_toc[i]['content'])
|
||||||
|
pre_toc[i]['content'] = re.sub(r'(?<!.)\n+', '', pre_toc[i]['content'])
|
||||||
|
except BaseException as e:
|
||||||
|
max_kb.info(f'此文档没有前言部分,按照普通文本处理: {e}')
|
||||||
if pattern_list is not None and len(pattern_list) > 0:
|
if pattern_list is not None and len(pattern_list) > 0:
|
||||||
split_model = SplitModel(pattern_list, with_filter, limit)
|
split_model = SplitModel(pattern_list, with_filter, limit)
|
||||||
else:
|
else:
|
||||||
@@ -243,6 +256,7 @@ class PdfSplitHandle(BaseSplitHandle):
|
|||||||
# 插入目录前的部分
|
# 插入目录前的部分
|
||||||
page_content = re.sub(r'(?<!。)\n+', '', page_content)
|
page_content = re.sub(r'(?<!。)\n+', '', page_content)
|
||||||
page_content = re.sub(r'(?<!.)\n+', '', page_content)
|
page_content = re.sub(r'(?<!.)\n+', '', page_content)
|
||||||
|
page_content = page_content.strip()
|
||||||
pre_toc = split_model.parse(page_content)
|
pre_toc = split_model.parse(page_content)
|
||||||
chapters = pre_toc + chapters
|
chapters = pre_toc + chapters
|
||||||
return chapters
|
return chapters
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user