音视频支持分段
Some checks are pending
sync2gitee / repo-sync (push) Waiting to run
Typos Check / Spell Check with Typos (push) Waiting to run

This commit is contained in:
朱潮 2025-12-18 23:57:04 +08:00
parent 77de4f6315
commit e509a078a4
2 changed files with 46 additions and 0 deletions

View File

@ -880,21 +880,37 @@ class DocumentSerializers(serializers.Serializer):
@post(post_function=post_embedding)
@transaction.atomic
def save(self, instance: Dict, with_valid=False, **kwargs):
# 添加详细日志
from common.utils.logger import maxkb_logger
maxkb_logger.info(f"=== Save Method ===")
maxkb_logger.info(f"self.data: {self.data}")
maxkb_logger.info(f"instance keys: {list(instance.keys())}")
maxkb_logger.info(f"stt_model_id in instance: {instance.get('stt_model_id')}")
maxkb_logger.info(f"llm_model_id in instance: {instance.get('llm_model_id')}")
# 从 self.data 中获取分块参数
if self.data.get('limit') is not None:
instance['split_limit'] = self.data.get('limit')
maxkb_logger.info(f"Set split_limit from self.data: {self.data.get('limit')}")
if self.data.get('patterns') is not None:
instance['split_patterns'] = self.data.get('patterns')
maxkb_logger.info(f"Set split_patterns from self.data: {self.data.get('patterns')}")
if self.data.get('with_filter') is not None:
instance['split_with_filter'] = self.data.get('with_filter')
maxkb_logger.info(f"Set split_with_filter from self.data: {self.data.get('with_filter')}")
# 同时也支持从 instance 中获取分块参数(向后兼容)
if instance.get('limit') is not None:
instance['split_limit'] = instance.get('limit')
maxkb_logger.info(f"Set split_limit from instance: {instance.get('limit')}")
if instance.get('patterns') is not None:
instance['split_patterns'] = instance.get('patterns')
maxkb_logger.info(f"Set split_patterns from instance: {instance.get('patterns')}")
if instance.get('with_filter') is not None:
instance['split_with_filter'] = instance.get('with_filter')
maxkb_logger.info(f"Set split_with_filter from instance: {instance.get('with_filter')}")
maxkb_logger.info(f"Final instance split params - split_limit: {instance.get('split_limit')}, split_patterns: {instance.get('split_patterns')}, split_with_filter: {instance.get('split_with_filter')}")
if with_valid:
DocumentInstanceSerializer(data=instance).is_valid(raise_exception=True)
@ -954,6 +970,12 @@ class DocumentSerializers(serializers.Serializer):
@staticmethod
def get_document_paragraph_model(knowledge_id, instance: Dict):
from common.utils.logger import maxkb_logger
maxkb_logger.info(f"=== get_document_paragraph_model ===")
maxkb_logger.info(f"instance split_limit: {instance.get('split_limit')}")
maxkb_logger.info(f"instance split_patterns: {instance.get('split_patterns')}")
maxkb_logger.info(f"instance split_with_filter: {instance.get('split_with_filter')}")
source_meta = {'source_file_id': instance.get('source_file_id')} if instance.get('source_file_id') else {}
# 添加MinerU模型参数到meta
if instance.get('llm_model_id'):
@ -966,13 +988,19 @@ class DocumentSerializers(serializers.Serializer):
# Add the chunking (split) parameters to the document meta.
if instance.get('split_limit') is not None:
    source_meta['split_limit'] = instance.get('split_limit')
    maxkb_logger.info(f"Added split_limit to source_meta: {instance.get('split_limit')}")
if instance.get('split_patterns') is not None:
    source_meta['split_patterns'] = instance.get('split_patterns')
    maxkb_logger.info(f"Added split_patterns to source_meta: {instance.get('split_patterns')}")
# Read the normalized 'split_with_filter' key, like the two parameters above.
# This branch previously looked only at the raw 'with_filter' key, so a value
# stored solely under 'split_with_filter' never reached the meta. The raw key
# is kept as a fallback so instances that only carry 'with_filter' still work.
split_with_filter = instance.get('split_with_filter')
if split_with_filter is None:
    split_with_filter = instance.get('with_filter')
if split_with_filter is not None:
    source_meta['split_with_filter'] = split_with_filter
    maxkb_logger.info(f"Added split_with_filter to source_meta: {split_with_filter}")
meta = {**instance.get('meta'), **source_meta} if instance.get('meta') is not None else source_meta
meta = convert_uuid_to_str(meta)
maxkb_logger.info(f"Final meta split params - split_limit: {meta.get('split_limit')}, split_patterns: {meta.get('split_patterns')}, split_with_filter: {meta.get('split_with_filter')}")
document_model = Document(
**{
'knowledge_id': knowledge_id,

View File

@ -45,6 +45,22 @@ class DocumentView(APIView):
get_knowledge_operation_object(keywords.get('knowledge_id')),
{'name': r.data.get('name')}), )
def post(self, request: Request, workspace_id: str, knowledge_id: str):
# 添加日志
from common.utils.logger import maxkb_logger
maxkb_logger.info(f"=== Document Upload View ===")
maxkb_logger.info(f"request.data keys: {list(request.data.keys())}")
maxkb_logger.info(f"request.FILES keys: {list(request.FILES.keys())}")
if 'limit' in request.data:
maxkb_logger.info(f"request.data limit: {request.data.get('limit')}")
if 'patterns' in request.data:
maxkb_logger.info(f"request.data patterns: {request.data.get('patterns')}")
if 'with_filter' in request.data:
maxkb_logger.info(f"request.data with_filter: {request.data.get('with_filter')}")
if 'stt_model_id' in request.data:
maxkb_logger.info(f"request.data stt_model_id: {request.data.get('stt_model_id')}")
if 'llm_model_id' in request.data:
maxkb_logger.info(f"request.data llm_model_id: {request.data.get('llm_model_id')}")
# 准备分块参数
serializer_data = {
'workspace_id': workspace_id,
@ -59,6 +75,8 @@ class DocumentView(APIView):
if 'with_filter' in request.data:
serializer_data['with_filter'] = request.data.get('with_filter')
maxkb_logger.info(f"serializer_data: {serializer_data}")
return result.success(
DocumentSerializers.Create(data=serializer_data).save(request.data))