fix: 分段时 title超过256字符将超出部分拼接给content
This commit is contained in:
parent
b93c406b19
commit
cc62c35995
@ -8,7 +8,7 @@
|
|||||||
"""
|
"""
|
||||||
import re
|
import re
|
||||||
from functools import reduce
|
from functools import reduce
|
||||||
from typing import List
|
from typing import List, Dict
|
||||||
|
|
||||||
import jieba
|
import jieba
|
||||||
|
|
||||||
@ -334,7 +334,24 @@ class SplitModel:
|
|||||||
result = result_tree_to_paragraph(result_tree, [], [])
|
result = result_tree_to_paragraph(result_tree, [], [])
|
||||||
# 过滤段落内容不为空字符串的数据
|
# 过滤段落内容不为空字符串的数据
|
||||||
result = [item for item in result if 'content' in item and len(item.get('content').strip()) > 0]
|
result = [item for item in result if 'content' in item and len(item.get('content').strip()) > 0]
|
||||||
return [{**item, 'title': item.get('title').replace("#", '') if 'title' in item else ''} for item in result]
|
return [self.post_reset_paragraph(item) for item in result]
|
||||||
|
|
||||||
|
def post_reset_paragraph(self, paragraph: Dict):
    """Normalize one split paragraph before it is returned to the caller.

    The paragraph is first passed through
    ``filter_title_special_characters`` and the result is then passed
    through ``sub_title``; the order matters because the title length is
    only checked after the title has been cleaned.

    :param paragraph: paragraph dict produced by the splitter.
    :return: the dict returned by ``sub_title``.
    """
    return self.sub_title(self.filter_title_special_characters(paragraph))
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def sub_title(paragraph: Dict, max_title_length: int = 255) -> Dict:
    """Cap the title length, moving any overflow to the front of the content.

    When the title is longer than ``max_title_length`` characters, a new
    dict is returned whose ``title`` holds the first ``max_title_length``
    characters and whose ``content`` is the remaining title text followed
    by the original content. Otherwise the input dict is returned as-is
    (same object, not a copy).

    :param paragraph: paragraph dict; ``title`` and ``content`` are optional.
    :param max_title_length: maximum number of characters kept in the title;
        expected to be non-negative. Defaults to 255.
    :return: the original dict, or a new dict with the shortened title.
    """
    title = paragraph.get('title')
    # A missing or None title has nothing to shorten; checking it first
    # avoids calling len() on None.
    if title is None or len(title) <= max_title_length:
        return paragraph
    # Treat missing/None content as empty so the overflow can still be kept
    # instead of failing on ``str + None``.
    content = paragraph.get('content') or ''
    return {
        **paragraph,
        'title': title[:max_title_length],
        'content': title[max_title_length:] + content,
    }
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def filter_title_special_characters(paragraph: Dict) -> Dict:
    """Return a copy of the paragraph with every ``#`` removed from its title.

    The returned dict always has a ``title`` key: a missing or ``None``
    title becomes the empty string. The input dict is not modified.

    :param paragraph: paragraph dict; ``title`` is optional.
    :return: new dict with the cleaned ``title`` and all other keys copied.
    """
    title = paragraph.get('title')
    # A None title is handled like a missing one, so ``.replace`` is never
    # called on None.
    cleaned_title = '' if title is None else title.replace("#", '')
    return {**paragraph, 'title': cleaned_title}
|
||||||
|
|
||||||
|
|
||||||
default_split_pattern = {
|
default_split_pattern = {
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user