feat: 段落分块设置最小分块字数 (#898)
This commit is contained in:
parent
61024a661e
commit
16d7316dca
@ -12,6 +12,7 @@ from typing import List
|
|||||||
from common.chunk.i_chunk_handle import IChunkHandle
|
from common.chunk.i_chunk_handle import IChunkHandle
|
||||||
|
|
||||||
split_chunk_pattern = "!|。|\n|;|;"
|
split_chunk_pattern = "!|。|\n|;|;"
|
||||||
|
min_chunk_len = 20
|
||||||
|
|
||||||
|
|
||||||
class MarkChunkHandle(IChunkHandle):
|
class MarkChunkHandle(IChunkHandle):
|
||||||
@ -20,5 +21,17 @@ class MarkChunkHandle(IChunkHandle):
|
|||||||
for chunk in chunk_list:
|
for chunk in chunk_list:
|
||||||
base_chunk = re.split(split_chunk_pattern, chunk)
|
base_chunk = re.split(split_chunk_pattern, chunk)
|
||||||
base_chunk = [chunk.strip() for chunk in base_chunk if len(chunk.strip()) > 0]
|
base_chunk = [chunk.strip() for chunk in base_chunk if len(chunk.strip()) > 0]
|
||||||
result = [*result, *base_chunk]
|
result_chunk = []
|
||||||
|
for c in base_chunk:
|
||||||
|
if len(result_chunk) == 0:
|
||||||
|
result_chunk.append(c)
|
||||||
|
else:
|
||||||
|
if len(result_chunk[-1]) < min_chunk_len:
|
||||||
|
result_chunk[-1] = result_chunk[-1] + c
|
||||||
|
else:
|
||||||
|
if len(c) < min_chunk_len:
|
||||||
|
result_chunk[-1] = result_chunk[-1] + c
|
||||||
|
else:
|
||||||
|
result_chunk.append(c)
|
||||||
|
result = [*result, *result_chunk]
|
||||||
return result
|
return result
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user