fix: 修复分段时,特殊情况会丢失数据 #938 (#946)

This commit is contained in:
shaohuzhang1 2024-08-07 19:52:05 +08:00 committed by GitHub
parent e59e26208b
commit 0ad5a76598
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -27,7 +27,7 @@ def get_level_block(text, level_content_list, level_content_index, cursor):
level_content_list) else None level_content_list) else None
start_index = text.index(start_content, cursor) start_index = text.index(start_content, cursor)
end_index = text.index(next_content, start_index + 1) if next_content is not None else len(text) end_index = text.index(next_content, start_index + 1) if next_content is not None else len(text)
return text[start_index+len(start_content):end_index], end_index return text[start_index + len(start_content):end_index], end_index
def to_tree_obj(content, state='title'): def to_tree_obj(content, state='title'):
@ -303,17 +303,20 @@ class SplitModel:
level_content_list.insert(0, to_tree_obj("")) level_content_list.insert(0, to_tree_obj(""))
cursor = 0 cursor = 0
for i in range(len(level_content_list)): level_title_content_list = [item for item in level_content_list if item.get('state') == 'title']
block, cursor = get_level_block(text, level_content_list, i, cursor) for i in range(len(level_title_content_list)):
start_content: str = level_title_content_list[i].get('content')
if cursor < text.index(start_content, cursor):
level_content_list.insert(0, to_tree_obj(text[cursor: text.index(start_content, cursor)], 'block'))
block, cursor = get_level_block(text, level_title_content_list, i, cursor)
if len(block) == 0: if len(block) == 0:
level_content_list[i]['children'] = [to_tree_obj("", "block")]
continue continue
children = self.parse_to_tree(text=block, index=index + 1) children = self.parse_to_tree(text=block, index=index + 1)
level_content_list[i]['children'] = children level_title_content_list[i]['children'] = children
first_child_idx_in_block = block.lstrip().index(children[0]["content"].lstrip()) first_child_idx_in_block = block.lstrip().index(children[0]["content"].lstrip())
if first_child_idx_in_block != 0: if first_child_idx_in_block != 0:
inner_children = self.parse_to_tree(block[:first_child_idx_in_block], index + 1) inner_children = self.parse_to_tree(block[:first_child_idx_in_block], index + 1)
level_content_list[i]['children'].extend(inner_children) level_title_content_list[i]['children'].extend(inner_children)
return level_content_list return level_content_list
def parse(self, text: str): def parse(self, text: str):