maxkb/apps/common/handle/impl/mineru/prompts/image_classification.py
2025-08-24 17:45:40 +08:00

108 lines
4.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Optimized image classification prompts for MinerU with caching support.
These prompts are designed to maximize prompt caching efficiency by:
1. Keeping static content at the beginning
2. Placing variable content at the end
3. Separating simple and context-aware versions for better cache hits
"""
# Base prompt for image classification (static part).
# Asks the model for a JSON object with the fields type, title, description
# and ocr_content, and forbids base64 image data and markdown image tags in
# the reply. Both prompt variants defined below start with this exact text,
# so it must stay free of per-request content.
IMAGE_CLASSIFICATION_BASE = """你是一个专业的文档分析助手。请根据图片内容进行分析。
# 任务要求:
请分析这张图片并输出JSON格式结果。输出需要包含以下字段
1. type: 图片类型分类
- "structured_content": 包含可提取结构的内容(文档、表格、图表、公式等)
- "brief_description": 有明确主题但无法提取结构(照片、插图等)
- "meaningless": 无实质内容(装饰、分隔线等)
2. title: 图片的简短标题10字以内
- 应该是语义化的标题,例如:"系统架构图""流程图""数据表格""统计图表""代码截图"
3. description: 图片内容的详细描述
- 描述图片的主要内容和视觉元素
4. ocr_content: 仅对structured_content类型提取图片中的文字内容
# 重要禁止项:
- 绝对禁止输出任何 base64 编码的图片数据
- 禁止输出类似 "data:image/png;base64,...""data:image/jpeg;base64,..." 的内容
- 禁止输出任何 markdown 格式的图片标签
- 违反以上规则会导致输出无效
# 输出格式:
```json
{
"type": "分类类型",
"title": "简短标题",
"description": "详细描述",
"ocr_content": "提取的文字内容(如适用)"
}
```"""
# Simple classification prompt (without context).
# Base prompt followed by extra requirements: a 50-100 character limit for
# the description field and an instruction to analyse the image on its own.
IMAGE_CLASSIFICATION_SIMPLE = IMAGE_CLASSIFICATION_BASE + """
# 具体要求:
- description字段请控制在50-100字以内
- 仅基于图片内容本身进行分析"""
# Context-aware classification prompt (with context).
# Base prompt followed by extra requirements: a 100-200 character limit for
# the description field, plus explaining how the image relates to the
# surrounding text and what role it plays in the document.
# The literal ends with the "# 上下文信息:" header and a newline, so whatever
# is concatenated next appears directly under that header.
IMAGE_CLASSIFICATION_CONTEXT_BASE = IMAGE_CLASSIFICATION_BASE + """
# 具体要求:
- description字段请控制在100-200字以内需要
- 解释图片与周围文本的关系
- 说明图片在文档中的作用
# 上下文信息:
"""
# Context template suffix (variable part - placed at the end).
# A str.format template with four named fields: page_idx, page_type,
# page_title_info and surrounding_text. It holds the per-image content, so it
# is kept separate from the static prompt text above.
CONTEXT_SUFFIX_TEMPLATE = """页面位置:第 {page_idx}
页面类型:{page_type}
{page_title_info}
# 周围文本内容:
{surrounding_text}"""
def format_image_classification_prompt(context=None, language_code=None, has_text_content=True):
    """
    Assemble the image classification prompt with static text first.

    The result is always one of the two static prompt prefixes, followed by an
    optional language-requirement section, followed (for the context-aware
    variant only) by the formatted context block.

    Args:
        context: Optional dict describing where the image sits in the
            document. Keys read: page_idx, page_type, page_title_info,
            surrounding_text; missing keys fall back to 1, '未知', '' and ''
            respectively. Any falsy value (None, empty dict) selects the
            simple prompt.
        language_code: Optional language code passed to LanguageDetector to
            obtain the output-language instruction.
        has_text_content: Whether text content was available for language
            detection. When falsy, the image-based language instruction is
            requested instead, and language_code is passed through as-is
            (it may be None).

    Returns:
        The assembled prompt string.
    """
    # Work out which kind of language instruction applies, if any. With text
    # content but no detected language, no language section is emitted.
    use_text_language = bool(has_text_content and language_code)
    use_image_language = not has_text_content

    language_instruction = ""
    if use_text_language or use_image_language:
        # Imported lazily so the detector module is only loaded on the paths
        # that actually need it.
        from ..language_detector import LanguageDetector

        if use_text_language:
            requirement = LanguageDetector.get_language_instruction(language_code)
        else:
            requirement = LanguageDetector.get_image_language_instruction(language_code)
        language_instruction = f"\n\n# 语言要求:\n{requirement}\n"

    # No context: the simple prompt plus the optional language section.
    if not context:
        return IMAGE_CLASSIFICATION_SIMPLE + language_instruction

    # Fill the variable suffix from the context dict, using defaults for any
    # key the caller did not supply.
    suffix_fields = {
        'page_idx': context.get('page_idx', 1),
        'page_type': context.get('page_type', '未知'),
        'page_title_info': context.get('page_title_info', ''),
        'surrounding_text': context.get('surrounding_text', ''),
    }
    context_suffix = CONTEXT_SUFFIX_TEMPLATE.format(**suffix_fields)

    # Per-image content goes last so the leading part of the prompt stays
    # identical across calls.
    # NOTE(review): the context base already ends with the "# 上下文信息:"
    # header, so the language section lands between that header and the
    # context body - confirm this layout is intended.
    return IMAGE_CLASSIFICATION_CONTEXT_BASE + language_instruction + context_suffix