Support segmentation for audio and video

朱潮 2025-12-18 23:49:25 +08:00
parent 5ea9262eab
commit 77de4f6315
2 changed files with 34 additions and 24 deletions


@@ -880,9 +880,15 @@ class DocumentSerializers(serializers.Serializer):
    @post(post_function=post_embedding)
    @transaction.atomic
    def save(self, instance: Dict, with_valid=False, **kwargs):
        # Make sure the chunking parameters use the expected key names:
        # the frontend sends limit, patterns and with_filter, which need to be
        # mapped to split_limit, split_patterns and split_with_filter.
        # Read the chunking parameters from self.data.
        if self.data.get('limit') is not None:
            instance['split_limit'] = self.data.get('limit')
        if self.data.get('patterns') is not None:
            instance['split_patterns'] = self.data.get('patterns')
        if self.data.get('with_filter') is not None:
            instance['split_with_filter'] = self.data.get('with_filter')
        # Also accept the chunking parameters directly from instance (backward compatibility).
        if instance.get('limit') is not None:
            instance['split_limit'] = instance.get('limit')
        if instance.get('patterns') is not None:
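
Taken in isolation, the key-name mapping above boils down to the following standalone sketch; it is not part of the commit, and normalize_split_params and the sample values are hypothetical:

from typing import Dict

# Hypothetical helper mirroring the mapping done in save(): frontend keys
# (limit, patterns, with_filter) are copied onto the split_-prefixed keys
# the splitting pipeline expects; the original keys are left in place.
def normalize_split_params(source: Dict, target: Dict) -> Dict:
    for src_key, dst_key in (('limit', 'split_limit'),
                             ('patterns', 'split_patterns'),
                             ('with_filter', 'split_with_filter')):
        if source.get(src_key) is not None:
            target[dst_key] = source.get(src_key)
    return target

if __name__ == '__main__':
    instance = {'name': 'demo.mp3', 'limit': 512}
    print(normalize_split_params(instance, instance))
    # -> {'name': 'demo.mp3', 'limit': 512, 'split_limit': 512}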
@@ -1256,30 +1262,22 @@ class DocumentSerializers(serializers.Serializer):
        from common.utils.logger import maxkb_logger
        import os
        # Verbose logging for debugging.
        maxkb_logger.info(f"batch_save called with workspace_id: {workspace_id}, knowledge_id: {knowledge_id}")
        maxkb_logger.info(f"instance_list contains {len(instance_list)} documents")
        # Read the chunking parameters.
        split_limit = self.data.get('limit')
        split_patterns = self.data.get('patterns')
        split_with_filter = self.data.get('with_filter')
        # Before processing the documents, make sure every chunking parameter is converted correctly.
        for idx, doc in enumerate(instance_list):
            # Make sure the chunking parameters use the expected key names:
            # the frontend sends limit, patterns and with_filter, which need to be
            # mapped to split_limit, split_patterns and split_with_filter.
            if doc.get('limit') is not None:
                doc['split_limit'] = doc.get('limit')
                maxkb_logger.info(f"Document {idx}: Converting limit={doc.get('limit')} to split_limit")
            if doc.get('patterns') is not None:
                doc['split_patterns'] = doc.get('patterns')
                maxkb_logger.info(f"Document {idx}: Converting patterns={doc.get('patterns')} to split_patterns")
            if doc.get('with_filter') is not None:
                doc['split_with_filter'] = doc.get('with_filter')
                maxkb_logger.info(f"Document {idx}: Converting with_filter={doc.get('with_filter')} to split_with_filter")
            maxkb_logger.info(f"Document {idx}: {doc.keys()}")
            if 'stt_model_id' in doc:
                maxkb_logger.info(f" - stt_model_id present: {doc['stt_model_id']}")
            if 'llm_model_id' in doc:
                maxkb_logger.info(f" - llm_model_id present: {doc['llm_model_id']}")
            if split_limit is not None:
                doc['split_limit'] = split_limit
            if split_patterns is not None:
                doc['split_patterns'] = split_patterns
            if split_with_filter is not None:
                doc['split_with_filter'] = split_with_filter
        for document in instance_list:
            # A MinerU-type document requires both llm_model_id and vision_model_id.
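
One consequence of the loop ordering above, as reconstructed here: the per-document limit/patterns/with_filter keys are copied first, and the request-level values from self.data are applied afterwards, so a request-level parameter overwrites a per-document one. A minimal sketch of that precedence, with made-up values:

# Hypothetical values illustrating the precedence implied by the loop order:
# per-document keys are converted first, request-level values win afterwards.
docs = [{'name': 'a.wav', 'limit': 256}, {'name': 'b.wav'}]
request_limit = 512  # stands in for split_limit = self.data.get('limit')

for doc in docs:
    if doc.get('limit') is not None:
        doc['split_limit'] = doc.get('limit')   # per-document value
    if request_limit is not None:
        doc['split_limit'] = request_limit      # request-level value overwrites

print([doc.get('split_limit') for doc in docs])  # -> [512, 512]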


@@ -45,10 +45,22 @@ class DocumentView(APIView):
                get_knowledge_operation_object(keywords.get('knowledge_id')),
                {'name': r.data.get('name')}), )
    def post(self, request: Request, workspace_id: str, knowledge_id: str):
        # Prepare the chunking parameters.
        serializer_data = {
            'workspace_id': workspace_id,
            'knowledge_id': knowledge_id
        }
        # Forward the chunking parameters into serializer_data.
        if 'limit' in request.data:
            serializer_data['limit'] = request.data.get('limit')
        if 'patterns' in request.data:
            serializer_data['patterns'] = request.data.get('patterns')
        if 'with_filter' in request.data:
            serializer_data['with_filter'] = request.data.get('with_filter')
        return result.success(
-           DocumentSerializers.Create(
-               data={'workspace_id': workspace_id, 'knowledge_id': knowledge_id},
-           ).save(request.data))
+           DocumentSerializers.Create(data=serializer_data).save(request.data))
    @extend_schema(
        methods=['GET'],
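
For completeness, a hedged sketch of a client request that exercises the new code path; the route, host, and IDs are assumptions inferred from the view signature, not something this diff specifies:

import requests

# Hypothetical endpoint and IDs; only the payload keys (limit, patterns,
# with_filter) are taken from the diff above.
BASE = 'http://localhost:8080/api'
workspace_id, knowledge_id = 'default', 'kb-123'
payload = {
    'name': 'lecture.mp3',
    'limit': 512,            # becomes serializer_data['limit']
    'patterns': ['\\n\\n'],  # becomes serializer_data['patterns']
    'with_filter': True,     # becomes serializer_data['with_filter']
}
resp = requests.post(
    f'{BASE}/workspace/{workspace_id}/knowledge/{knowledge_id}/document',
    json=payload,
)
print(resp.status_code, resp.json())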