""" Platform-independent configuration module for MinerU-based parsing. This module provides default configuration that can be overridden by platform adapters. """ import os from typing import List, Optional from dataclasses import dataclass from .logger import get_module_logger logger = get_module_logger('config_base') @dataclass class MinerUConfig: """Configuration class for MinerU parsing system - platform independent""" # API Configuration mineru_api_key: Optional[str] = None mineru_api_url: str = "https://mineru.net" mineru_api_type: str = "cloud" # "cloud" or "self_hosted" # LLM Configuration llm_api_key: Optional[str] = None llm_api_url: str = "https://api.openai.com" llm_model: str = "gpt-4o" # Multimodal Model Configuration multimodal_api_key: Optional[str] = None multimodal_api_url: str = "https://api.openai.com" multimodal_model: str = "gpt-4-vision-preview" # File Processing Configuration max_file_size: int = 50 * 1024 * 1024 # 50MB supported_formats: List[str] = None # PPT Conversion Configuration libreoffice_path: str = "soffice" conversion_timeout: int = 300 # 5 minutes # Processing Parameters max_concurrent_uploads: int = 5 max_concurrent_api_calls: int = 3 max_image_size_mb: float = 5.0 compression_quality: int = 85 upload_max_retries: int = 3 upload_retry_delay: float = 1.0 # Batch Processing Configuration batch_processing_enabled: bool = True batch_size: int = 20 batch_processing_threshold: int = 20 # Retry Configuration api_max_retries: int = 3 api_retry_delay: float = 2.0 api_retry_backoff: float = 2.0 api_retry_max_delay: float = 30.0 retry_on_errors: List[str] = None # Context-aware Processing Configuration enable_context_extraction: bool = True context_window_size: int = 2 context_extraction_mode: str = "page" max_context_tokens: int = 1000 filter_content_types: List[str] = None use_context_for_images: bool = True # Image Processing Configuration max_images_per_page: int = 10 max_images_per_document: int = 200 filter_meaningless_images: bool = True min_image_size: int = 10000 max_image_size: int = 10000000 min_image_width: int = 100 min_image_height: int = 100 min_image_size_kb: float = 10.0 max_image_width: int = 10000 max_image_height: int = 10000 skip_recognition_for_small_images: bool = True # Multimodal Content Processing enable_multimodal_refinement: bool = True # Prompt Configuration use_enhanced_prompts: bool = True prompt_temperature: float = 0.0 # Logging Configuration log_level: str = "INFO" # Cache settings enable_cache: bool = True cache_version: str = "101" # Queue Processing Configuration queue_size: int = 50 processing_timeout: int = 7200 num_parser_threads: int = 1 num_refiner_threads: int = 1 num_recognizer_threads: int = 1 num_uploader_threads: int = 1 def __post_init__(self): """Initialize configuration from environment variables""" if self.supported_formats is None: self.supported_formats = ['pdf', 'ppt', 'pptx', 'doc', 'docx', 'png', 'jpg', 'jpeg', 'gif', 'bmp', 'tiff', 'tif', 'webp', 'svg'] if self.filter_content_types is None: self.filter_content_types = ['text', 'title'] # Load from environment variables (platform-independent) self.mineru_api_key = os.getenv('MINERU_API_KEY', self.mineru_api_key) self.mineru_api_url = os.getenv('MINERU_API_URL', self.mineru_api_url) self.mineru_api_type = os.getenv('MINERU_API_TYPE', self.mineru_api_type) self.max_file_size = int(os.getenv('MAX_FILE_SIZE', str(self.max_file_size))) self.libreoffice_path = os.getenv('LIBREOFFICE_PATH', self.libreoffice_path) self.conversion_timeout = int(os.getenv('CONVERSION_TIMEOUT', str(self.conversion_timeout))) # Batch processing self.batch_processing_enabled = os.getenv('MINERU_BATCH_PROCESSING', 'true').lower() == 'true' self.batch_size = int(os.getenv('MINERU_BATCH_SIZE', str(self.batch_size))) self.batch_processing_threshold = int(os.getenv('MINERU_BATCH_THRESHOLD', str(self.batch_processing_threshold))) # Retry configuration self.api_max_retries = int(os.getenv('MINERU_API_MAX_RETRIES', str(self.api_max_retries))) self.api_retry_delay = float(os.getenv('MINERU_API_RETRY_DELAY', str(self.api_retry_delay))) self.api_retry_backoff = float(os.getenv('MINERU_API_RETRY_BACKOFF', str(self.api_retry_backoff))) self.api_retry_max_delay = float(os.getenv('MINERU_API_RETRY_MAX_DELAY', str(self.api_retry_max_delay))) if self.retry_on_errors is None: retry_errors = os.getenv('MINERU_RETRY_ON_ERRORS', '') self.retry_on_errors = [e.strip() for e in retry_errors.split(',') if e.strip()] if retry_errors else [] # Context-aware processing self.enable_context_extraction = os.getenv('ENABLE_CONTEXT_EXTRACTION', 'true').lower() == 'true' self.context_window_size = int(os.getenv('CONTEXT_WINDOW_SIZE', str(self.context_window_size))) self.context_extraction_mode = os.getenv('CONTEXT_EXTRACTION_MODE', self.context_extraction_mode) self.max_context_tokens = int(os.getenv('MAX_CONTEXT_TOKENS', str(self.max_context_tokens))) self.use_context_for_images = os.getenv('USE_CONTEXT_FOR_IMAGES', 'true').lower() == 'true' self.use_enhanced_prompts = os.getenv('USE_ENHANCED_PROMPTS', 'true').lower() == 'true' # Image processing self.min_image_width = int(os.getenv('MINERU_MIN_IMAGE_WIDTH', str(self.min_image_width))) self.min_image_height = int(os.getenv('MINERU_MIN_IMAGE_HEIGHT', str(self.min_image_height))) self.min_image_size_kb = float(os.getenv('MINERU_MIN_IMAGE_SIZE_KB', str(self.min_image_size_kb))) self.max_image_width = int(os.getenv('MINERU_MAX_IMAGE_WIDTH', str(self.max_image_width))) self.max_image_height = int(os.getenv('MINERU_MAX_IMAGE_HEIGHT', str(self.max_image_height))) self.skip_recognition_for_small_images = os.getenv('MINERU_SKIP_SMALL_IMAGES', 'true').lower() == 'true' # Multimodal self.enable_multimodal_refinement = os.getenv('MINERU_MULTIMODAL_REFINEMENT', 'true').lower() == 'true' # Image filtering self.max_images_per_page = int(os.getenv('MINERU_MAX_IMAGES_PER_PAGE', str(self.max_images_per_page))) self.max_images_per_document = int(os.getenv('MINERU_MAX_IMAGES_PER_DOCUMENT', str(self.max_images_per_document))) self.filter_meaningless_images = os.getenv('MINERU_FILTER_MEANINGLESS', 'true').lower() == 'true' self.min_image_size = int(os.getenv('MINERU_MIN_IMAGE_SIZE', str(self.min_image_size))) self.max_image_size = int(os.getenv('MINERU_MAX_IMAGE_SIZE', str(self.max_image_size))) # Queue processing self.queue_size = int(os.getenv('MINERU_QUEUE_SIZE', str(self.queue_size))) self.processing_timeout = int(os.getenv('MINERU_PROCESSING_TIMEOUT', str(self.processing_timeout))) self.num_parser_threads = int(os.getenv('MINERU_PARSER_THREADS', str(self.num_parser_threads))) self.num_refiner_threads = int(os.getenv('MINERU_REFINER_THREADS', str(self.num_refiner_threads))) self.num_recognizer_threads = int(os.getenv('MINERU_RECOGNIZER_THREADS', str(self.num_recognizer_threads))) self.num_uploader_threads = int(os.getenv('MINERU_UPLOADER_THREADS', str(self.num_uploader_threads))) self.log_level = os.getenv('LOG_LEVEL', self.log_level) def update_from_platform(self, platform_settings: dict): """Update configuration from platform-specific settings""" if not platform_settings: return # Update API keys if provided if 'llm_api_key' in platform_settings: self.llm_api_key = platform_settings['llm_api_key'] if 'multimodal_api_key' in platform_settings: self.multimodal_api_key = platform_settings['multimodal_api_key'] # Update processing parameters for key in ['max_concurrent_uploads', 'max_concurrent_api_calls', 'max_image_size_mb', 'compression_quality', 'upload_max_retries', 'upload_retry_delay']: if key in platform_settings: setattr(self, key, platform_settings[key]) # Update cache settings if 'enable_cache' in platform_settings: self.enable_cache = platform_settings['enable_cache'] if 'cache_version' in platform_settings: self.cache_version = platform_settings['cache_version'] def validate(self) -> bool: """Validate configuration settings""" # For self-hosted MinerU, API key is not required if self.mineru_api_type == "cloud" and not self.mineru_api_key: logger.warning("MINERU_API_KEY not configured for cloud API") return False if self.max_file_size <= 0: logger.error("Invalid max_file_size configuration") return False if self.mineru_api_type not in ["cloud", "self_hosted"]: logger.error(f"Invalid mineru_api_type: {self.mineru_api_type}") return False return True # Platform-specific methods - should be overridden by adapters def get_learn_info(self, learn_type: int) -> dict: """Get learn_info configuration - override in platform adapter""" return {} def get_model_config(self, model_type: int, use_llm: bool = False) -> dict: """Get model configuration - override in platform adapter""" return {} async def call_litellm(self, model_type: int, messages: list, use_llm: bool = False, **kwargs) -> any: """Call litellm - override in platform adapter if needed""" import os import litellm # Get model config from adapter model_config = self.get_model_config(model_type, use_llm) if not model_config: raise ValueError(f"No model config for type {model_type}") # Set environment variables if 'key' in model_config and 'keyname' in model_config: os.environ[model_config['keyname']] = model_config['key'] if model_config.get('base') and model_config.get('basename'): os.environ[model_config['basename']] = model_config['base'] # Prepare call kwargs call_kwargs = {} if model_config.get('base'): call_kwargs['api_base'] = model_config['base'] call_kwargs.update(kwargs) # Handle response_format for non-GPT models if 'response_format' in call_kwargs and 'gpt' not in model_config.get('model', '').lower(): call_kwargs.pop('response_format', None) # Call litellm response = await litellm.acompletion( model=model_config['model'], messages=messages, **call_kwargs ) return response