# maxkb/apps/common/handle/impl/mineru/config_base.py
# 2025-08-24 00:56:02 +08:00
#
# 258 lines
# 11 KiB
# Python

"""
Platform-independent configuration module for MinerU-based parsing.
This module provides default configuration that can be overridden by platform adapters.
"""
import os
from dataclasses import dataclass
from typing import Any, List, Optional

from .logger import get_module_logger
# Module-level logger; MinerUConfig.validate() reports configuration problems through it.
logger = get_module_logger('config_base')
@dataclass
class MinerUConfig:
    """Configuration class for MinerU parsing system - platform independent.

    Field defaults are declared below. After construction, ``__post_init__``
    fills the list-valued fields and applies environment-variable overrides;
    an environment variable, when set, takes precedence over the value passed
    to the constructor, and an unset variable leaves that value untouched.

    Platform adapters are expected to override ``get_learn_info`` and
    ``get_model_config`` and may push settings in via ``update_from_platform``.
    """

    # API Configuration
    mineru_api_key: Optional[str] = None
    mineru_api_url: str = "https://mineru.net"
    mineru_api_type: str = "cloud"  # "cloud" or "self_hosted"

    # LLM Configuration
    llm_api_key: Optional[str] = None
    llm_api_url: str = "https://api.openai.com"
    llm_model: str = "gpt-4o"

    # Multimodal Model Configuration
    multimodal_api_key: Optional[str] = None
    multimodal_api_url: str = "https://api.openai.com"
    multimodal_model: str = "gpt-4-vision-preview"

    # File Processing Configuration
    max_file_size: int = 50 * 1024 * 1024  # 50MB
    # None is a sentinel: __post_init__ replaces it with the default list.
    supported_formats: Optional[List[str]] = None

    # PPT Conversion Configuration
    libreoffice_path: str = "libreoffice"
    conversion_timeout: int = 300  # 5 minutes

    # Processing Parameters
    max_concurrent_uploads: int = 5
    max_concurrent_api_calls: int = 3
    max_image_size_mb: float = 5.0
    compression_quality: int = 85
    upload_max_retries: int = 3
    upload_retry_delay: float = 1.0

    # Batch Processing Configuration
    batch_processing_enabled: bool = True
    batch_size: int = 20
    batch_processing_threshold: int = 20

    # Retry Configuration
    api_max_retries: int = 3
    api_retry_delay: float = 2.0
    api_retry_backoff: float = 2.0
    api_retry_max_delay: float = 30.0
    # None is a sentinel: __post_init__ reads MINERU_RETRY_ON_ERRORS instead.
    retry_on_errors: Optional[List[str]] = None

    # Context-aware Processing Configuration
    enable_context_extraction: bool = True
    context_window_size: int = 2
    context_extraction_mode: str = "page"
    max_context_tokens: int = 1000
    # None is a sentinel: __post_init__ replaces it with ['text', 'title'].
    filter_content_types: Optional[List[str]] = None
    use_context_for_images: bool = True

    # Image Processing Configuration
    max_images_per_page: int = 10
    max_images_per_document: int = 200
    filter_meaningless_images: bool = True
    min_image_size: int = 10000
    max_image_size: int = 10000000
    min_image_width: int = 100
    min_image_height: int = 100
    min_image_size_kb: float = 10.0
    max_image_width: int = 10000
    max_image_height: int = 10000
    skip_recognition_for_small_images: bool = True

    # Multimodal Content Processing
    enable_multimodal_refinement: bool = True

    # Prompt Configuration
    use_enhanced_prompts: bool = True
    prompt_temperature: float = 0.0

    # Logging Configuration
    log_level: str = "INFO"

    # Cache settings
    enable_cache: bool = True
    cache_version: str = "101"

    # Queue Processing Configuration
    queue_size: int = 50
    processing_timeout: int = 600
    num_parser_threads: int = 1
    num_refiner_threads: int = 1
    num_recognizer_threads: int = 1
    num_uploader_threads: int = 1

    @staticmethod
    def _env_int(name: str, current: int) -> int:
        """Return env var *name* parsed as int, or *current* when it is unset.

        Raises:
            ValueError: if the variable is set to a non-integer string.
        """
        return int(os.getenv(name, str(current)))

    @staticmethod
    def _env_float(name: str, current: float) -> float:
        """Return env var *name* parsed as float, or *current* when it is unset.

        Raises:
            ValueError: if the variable is set to a non-numeric string.
        """
        return float(os.getenv(name, str(current)))

    @staticmethod
    def _env_bool(name: str, current: bool) -> bool:
        """Return env var *name* as a bool, or *current* when it is unset.

        Only the string ``true`` (case-insensitive) counts as True; any other
        value of a set variable yields False.
        """
        raw = os.getenv(name)
        if raw is None:
            # Unset variable: keep what the caller configured instead of
            # silently forcing the flag back to True.
            return bool(current)
        return raw.lower() == 'true'

    def __post_init__(self):
        """Fill list defaults and apply environment-variable overrides.

        Raises:
            ValueError: if a numeric environment variable holds a value that
                cannot be parsed as int/float.
        """
        if self.supported_formats is None:
            self.supported_formats = ['pdf', 'ppt', 'pptx', 'doc', 'docx',
                                      'png', 'jpg', 'jpeg', 'gif', 'bmp',
                                      'tiff', 'tif', 'webp', 'svg']
        if self.filter_content_types is None:
            self.filter_content_types = ['text', 'title']

        env_int = self._env_int
        env_float = self._env_float
        env_bool = self._env_bool

        # Load from environment variables (platform-independent)
        self.mineru_api_key = os.getenv('MINERU_API_KEY', self.mineru_api_key)
        self.mineru_api_url = os.getenv('MINERU_API_URL', self.mineru_api_url)
        self.mineru_api_type = os.getenv('MINERU_API_TYPE', self.mineru_api_type)
        self.max_file_size = env_int('MAX_FILE_SIZE', self.max_file_size)
        self.libreoffice_path = os.getenv('LIBREOFFICE_PATH', self.libreoffice_path)
        self.conversion_timeout = env_int('CONVERSION_TIMEOUT', self.conversion_timeout)

        # Batch processing
        self.batch_processing_enabled = env_bool(
            'MINERU_BATCH_PROCESSING', self.batch_processing_enabled)
        self.batch_size = env_int('MINERU_BATCH_SIZE', self.batch_size)
        self.batch_processing_threshold = env_int(
            'MINERU_BATCH_THRESHOLD', self.batch_processing_threshold)

        # Retry configuration
        self.api_max_retries = env_int('MINERU_API_MAX_RETRIES', self.api_max_retries)
        self.api_retry_delay = env_float('MINERU_API_RETRY_DELAY', self.api_retry_delay)
        self.api_retry_backoff = env_float('MINERU_API_RETRY_BACKOFF', self.api_retry_backoff)
        self.api_retry_max_delay = env_float(
            'MINERU_API_RETRY_MAX_DELAY', self.api_retry_max_delay)
        if self.retry_on_errors is None:
            # Comma-separated list; blank entries are dropped.
            retry_errors = os.getenv('MINERU_RETRY_ON_ERRORS', '')
            self.retry_on_errors = [
                item.strip() for item in retry_errors.split(',') if item.strip()
            ]

        # Context-aware processing
        self.enable_context_extraction = env_bool(
            'ENABLE_CONTEXT_EXTRACTION', self.enable_context_extraction)
        self.context_window_size = env_int('CONTEXT_WINDOW_SIZE', self.context_window_size)
        self.context_extraction_mode = os.getenv(
            'CONTEXT_EXTRACTION_MODE', self.context_extraction_mode)
        self.max_context_tokens = env_int('MAX_CONTEXT_TOKENS', self.max_context_tokens)
        self.use_context_for_images = env_bool(
            'USE_CONTEXT_FOR_IMAGES', self.use_context_for_images)
        self.use_enhanced_prompts = env_bool('USE_ENHANCED_PROMPTS', self.use_enhanced_prompts)

        # Image processing
        self.min_image_width = env_int('MINERU_MIN_IMAGE_WIDTH', self.min_image_width)
        self.min_image_height = env_int('MINERU_MIN_IMAGE_HEIGHT', self.min_image_height)
        self.min_image_size_kb = env_float('MINERU_MIN_IMAGE_SIZE_KB', self.min_image_size_kb)
        self.max_image_width = env_int('MINERU_MAX_IMAGE_WIDTH', self.max_image_width)
        self.max_image_height = env_int('MINERU_MAX_IMAGE_HEIGHT', self.max_image_height)
        self.skip_recognition_for_small_images = env_bool(
            'MINERU_SKIP_SMALL_IMAGES', self.skip_recognition_for_small_images)

        # Multimodal
        self.enable_multimodal_refinement = env_bool(
            'MINERU_MULTIMODAL_REFINEMENT', self.enable_multimodal_refinement)

        # Image filtering
        self.max_images_per_page = env_int(
            'MINERU_MAX_IMAGES_PER_PAGE', self.max_images_per_page)
        self.max_images_per_document = env_int(
            'MINERU_MAX_IMAGES_PER_DOCUMENT', self.max_images_per_document)
        self.filter_meaningless_images = env_bool(
            'MINERU_FILTER_MEANINGLESS', self.filter_meaningless_images)
        self.min_image_size = env_int('MINERU_MIN_IMAGE_SIZE', self.min_image_size)
        self.max_image_size = env_int('MINERU_MAX_IMAGE_SIZE', self.max_image_size)

        # Queue processing
        self.queue_size = env_int('MINERU_QUEUE_SIZE', self.queue_size)
        self.processing_timeout = env_int('MINERU_PROCESSING_TIMEOUT', self.processing_timeout)
        self.num_parser_threads = env_int('MINERU_PARSER_THREADS', self.num_parser_threads)
        self.num_refiner_threads = env_int('MINERU_REFINER_THREADS', self.num_refiner_threads)
        self.num_recognizer_threads = env_int(
            'MINERU_RECOGNIZER_THREADS', self.num_recognizer_threads)
        self.num_uploader_threads = env_int(
            'MINERU_UPLOADER_THREADS', self.num_uploader_threads)

        self.log_level = os.getenv('LOG_LEVEL', self.log_level)

    def update_from_platform(self, platform_settings: dict):
        """Update configuration from platform-specific settings.

        Only a fixed set of keys is honoured (API keys, upload/processing
        parameters, cache settings); any other key in *platform_settings* is
        ignored. A falsy argument (``None`` or an empty dict) is a no-op.
        """
        if not platform_settings:
            return

        accepted_keys = (
            # API keys
            'llm_api_key', 'multimodal_api_key',
            # Processing parameters
            'max_concurrent_uploads', 'max_concurrent_api_calls',
            'max_image_size_mb', 'compression_quality',
            'upload_max_retries', 'upload_retry_delay',
            # Cache settings
            'enable_cache', 'cache_version',
        )
        for key in accepted_keys:
            if key in platform_settings:
                setattr(self, key, platform_settings[key])

    def validate(self) -> bool:
        """Validate configuration settings.

        Returns:
            False (after logging the reason) when the cloud API is selected
            without an API key, when ``max_file_size`` is not positive, or when
            ``mineru_api_type`` is neither "cloud" nor "self_hosted";
            True otherwise.
        """
        # For self-hosted MinerU, API key is not required
        if self.mineru_api_type == "cloud" and not self.mineru_api_key:
            logger.warning("MINERU_API_KEY not configured for cloud API")
            return False
        if self.max_file_size <= 0:
            logger.error("Invalid max_file_size configuration")
            return False
        if self.mineru_api_type not in ["cloud", "self_hosted"]:
            logger.error(f"Invalid mineru_api_type: {self.mineru_api_type}")
            return False
        return True

    # Platform-specific methods - should be overridden by adapters
    def get_learn_info(self, learn_type: int) -> dict:
        """Get learn_info configuration - override in platform adapter.

        The base implementation returns an empty dict.
        """
        return {}

    def get_model_config(self, model_type: int, use_llm: bool = False) -> dict:
        """Get model configuration - override in platform adapter.

        The base implementation returns an empty dict, which makes
        ``call_litellm`` raise ``ValueError``.
        """
        return {}

    async def call_litellm(self, model_type: int, messages: list,
                           use_llm: bool = False, **kwargs) -> Any:
        """Call litellm - override in platform adapter if needed.

        Looks up the model config via ``get_model_config`` and forwards
        *messages* plus any extra *kwargs* to ``litellm.acompletion``.

        Config keys read here: ``model`` (required), ``key``/``keyname`` and
        ``base``/``basename`` (optional pairs exported to the environment),
        ``base`` (also passed as ``api_base``).

        Note: this writes to ``os.environ``, which is process-wide.

        Raises:
            ValueError: if ``get_model_config`` returns an empty config.
            KeyError: if the config has no ``model`` entry.
        """
        # Get model config from adapter; check it before importing litellm so
        # a missing config is reported even where litellm is not installed.
        model_config = self.get_model_config(model_type, use_llm)
        if not model_config:
            raise ValueError(f"No model config for type {model_type}")

        # Imported lazily so the module can be loaded without litellm.
        import litellm

        # Set environment variables
        if 'key' in model_config and 'keyname' in model_config:
            os.environ[model_config['keyname']] = model_config['key']
        if model_config.get('base') and model_config.get('basename'):
            os.environ[model_config['basename']] = model_config['base']

        # Prepare call kwargs; caller-supplied kwargs win over api_base.
        call_kwargs = {}
        if model_config.get('base'):
            call_kwargs['api_base'] = model_config['base']
        call_kwargs.update(kwargs)

        # Handle response_format for non-GPT models: the option is dropped
        # unless the model name contains "gpt".
        if 'response_format' in call_kwargs and 'gpt' not in model_config.get('model', '').lower():
            call_kwargs.pop('response_format', None)

        # Call litellm
        response = await litellm.acompletion(
            model=model_config['model'],
            messages=messages,
            **call_kwargs
        )
        return response