""" Optimized image classification prompts for MinerU with caching support. These prompts are designed to maximize prompt caching efficiency by: 1. Keeping static content at the beginning 2. Placing variable content at the end 3. Separating simple and context-aware versions for better cache hits """ # Base prompt for image classification (static part) IMAGE_CLASSIFICATION_BASE = """你是一个专业的文档分析助手。请根据图片内容进行分析。 # 任务要求: 请分析这张图片并输出JSON格式结果。输出需要包含以下字段: 1. type: 图片类型分类 - "structured_content": 包含可提取结构的内容(文档、表格、图表、公式等) - "brief_description": 有明确主题但无法提取结构(照片、插图等) - "meaningless": 无实质内容(装饰、分隔线等) 2. title: 图片的简短标题(10字以内) - 应该是语义化的标题,例如:"系统架构图"、"流程图"、"数据表格"、"统计图表"、"代码截图"等 3. description: 图片内容的详细描述 - 描述图片的主要内容和视觉元素 4. ocr_content: 仅对structured_content类型,提取图片中的文字内容 # 重要禁止项: - 绝对禁止输出任何 base64 编码的图片数据 - 禁止输出类似 "data:image/png;base64,..." 或 "data:image/jpeg;base64,..." 的内容 - 禁止输出任何 markdown 格式的图片标签 - 违反以上规则会导致输出无效 # 输出格式: ```json { "type": "分类类型", "title": "简短标题", "description": "详细描述", "ocr_content": "提取的文字内容(如适用)" } ```""" # Simple classification prompt (without context) IMAGE_CLASSIFICATION_SIMPLE = IMAGE_CLASSIFICATION_BASE + """ # 具体要求: - description字段请控制在50-100字以内 - 仅基于图片内容本身进行分析""" # Context-aware classification prompt (with context) IMAGE_CLASSIFICATION_CONTEXT_BASE = IMAGE_CLASSIFICATION_BASE + """ # 具体要求: - description字段请控制在100-200字以内,需要: - 解释图片与周围文本的关系 - 说明图片在文档中的作用 # 上下文信息: """ # Context template suffix (variable part - placed at the end) CONTEXT_SUFFIX_TEMPLATE = """页面位置:第 {page_idx} 页 页面类型:{page_type} {page_title_info} # 周围文本内容: {surrounding_text}""" def format_image_classification_prompt(context=None, language_code=None, has_text_content=True): """ Format the image classification prompt optimized for caching. Args: context: Optional context information dict with keys: page_idx, page_type, page_title_info, surrounding_text language_code: Optional language code for output language matching has_text_content: Whether there's text content available for language detection Returns: Formatted prompt string """ # Add language instruction based on content availability language_instruction = "" if has_text_content and language_code: # Have text content and detected language from ..language_detector import LanguageDetector language_instruction = f"\n\n# 语言要求:\n{LanguageDetector.get_language_instruction(language_code)}\n" elif not has_text_content: # No text content, use image-based language detection from ..language_detector import LanguageDetector language_instruction = f"\n\n# 语言要求:\n{LanguageDetector.get_image_language_instruction(language_code)}\n" if context: # Build context suffix context_suffix = CONTEXT_SUFFIX_TEMPLATE.format( page_idx=context.get('page_idx', 1), page_type=context.get('page_type', '未知'), page_title_info=context.get('page_title_info', ''), surrounding_text=context.get('surrounding_text', '无') ) # Return context-aware prompt with variable content at the end return IMAGE_CLASSIFICATION_CONTEXT_BASE + language_instruction + context_suffix else: # Return simple prompt with language instruction return IMAGE_CLASSIFICATION_SIMPLE + language_instruction