add mineru

2025-08-24 17:45:40 +08:00 · 2025-08-24 17:45:40 +08:00 · f0263bf189
commit f0263bf189
parent 35f9a4dbfe
20 changed files with 845 additions and 184 deletions
--- a/MINERU_STORAGE_README.md
+++ b/MINERU_STORAGE_README.md
@ -0,0 +1,61 @@
+# MinerU 图片存储配置说明
+
+## 访问URL格式
+
+MinerU解析后的图片访问URL格式为：
+```
+http://localhost:8080/storage/mineru/images/xxx.jpg
+```
+
+URL 简洁直接：图片通过 `/storage/` 路径直接访问。路径中的 `mineru` 是默认子目录；上传时如果提供了 knowledge_id，则子目录为对应的 knowledge_id。
+
+## 存储路径配置
+
+### 本地开发环境
+- 存储路径：`./tmp/maxkb/storage/`
+- 图片位置：`./tmp/maxkb/storage/mineru/images/`
+
+### Docker环境
+- 容器内路径：`/opt/maxkb/storage/`
+- 本地映射路径：`~/.maxkb/storage/`
+- 图片位置：`~/.maxkb/storage/mineru/images/`
+
+## 环境变量配置
+
+在 `.env` 文件或 `docker-compose.yml` 中添加：
+```bash
+MAXKB_STORAGE_PATH=/opt/maxkb/storage
+```
+
+## Docker Volume配置
+
+在 `docker-compose.yml` 中已配置：
+```yaml
+volumes:
+  - ~/.maxkb/storage:/opt/maxkb/storage:rw
+```
+
+## 测试访问
+
+1. 运行测试脚本创建测试图片：
+```bash
+python test_image_access.py
+```
+
+2. 访问测试URL：
+```
+http://localhost:8080/storage/mineru/images/ac3681aaa7a346b49ef9c7ceb7b94058.jpg
+```
+
+## 故障排查
+
+1. **404错误**：检查文件是否存在于存储目录
+2. **权限错误**：确保运行服务的用户对存储目录有读写权限
+3. **路径错误**：确认URL路径以 `/storage/` 开头
+
+## 相关文件
+
+- 存储视图：`apps/oss/views/storage.py`
+- URL配置：`apps/oss/urls.py`
+- MinerU适配器：`apps/common/handle/impl/mineru/maxkb_adapter/adapter.py`
+- 配置文件：`apps/common/handle/impl/mineru/maxkb_adapter/config_maxkb.py`
--- a/apps/common/handle/impl/mineru/base_parser.py
+++ b/apps/common/handle/impl/mineru/base_parser.py
@ -268,12 +268,35 @@ class BaseMinerUExtractor:
                self.logger.info(f"mineru-parser: found {len(cached_pages)} cached pages")
            
            # 获取上传回调（通过适配器）
-            upload_callback = lambda file_path: self.adapter.upload_file(file_path, upload_options)
+            # 总是创建upload_callback，让适配器决定如何处理
+            # upload_func 需要接受4个参数: filepath, filename, upload_options, binary_data
+            async def upload_callback(filepath, filename, options, binary_data=None):
+                """Upload one image via the adapter and return ``(url, upload_key)``.
+
+                Args:
+                    filepath: Path of the image on disk; used when ``binary_data`` is falsy.
+                    filename: Image file name; supplies the temp-file suffix and
+                        appears in the warning log.
+                    options: Accepted for signature compatibility but unused; the
+                        enclosing scope's ``upload_options`` is forwarded instead.
+                    binary_data: Optional in-memory image bytes (the image was
+                        compressed); written to a temporary file before uploading.
+
+                Returns:
+                    ``(result, None)`` where ``result`` is whatever
+                    ``adapter.upload_file`` returns, or ``(None, None)`` when
+                    neither a path nor binary data is given.
+                """
+                if binary_data:
+                    import tempfile
+                    # The adapter uploads by path, so spill the bytes to a temp file first.
+                    tmp = tempfile.NamedTemporaryFile(suffix=os.path.splitext(filename)[1], delete=False)
+                    try:
+                        with tmp:
+                            tmp.write(binary_data)
+                        result = await self.adapter.upload_file(tmp.name, upload_options)
+                    finally:
+                        # Always remove the temp file, even if writing or uploading fails.
+                        try:
+                            os.unlink(tmp.name)
+                        except OSError as e:
+                            self.logger.debug(f"Failed to remove temp file {tmp.name}: {e}")
+                elif filepath:
+                    result = await self.adapter.upload_file(filepath, upload_options)
+                else:
+                    self.logger.warning(f"No file path or binary data provided for {filename}")
+                    return None, None
+
+                # Return in (url, upload_key) format; no upload key is produced here.
+                return result, None
            
            # 并行处理文档
            completed_tasks = await self.parallel_processor.process_document_with_cache(
                pdf_path, temp_dir, src_fileid, is_ppt_format,
-                len(pages_info), upload_callback if upload_options else None, upload_options,
+                len(pages_info), upload_callback, upload_options,
                cached_pages=cached_pages,
                save_callback=lambda idx, data: self._save_page_cache(temp_dir, idx, data)
            )
--- a/apps/common/handle/impl/mineru/gbase_adapter/adapter.py
+++ b/apps/common/handle/impl/mineru/gbase_adapter/adapter.py
@ -45,6 +45,11 @@ class GPTBaseAdapter(PlatformAdapter):
    
    async def upload_file(self, file_path: str, options: Any = None) -> str:
        """Upload a file - PDFs get special S3 handling, images are uploaded via gzero."""
+        # Import before first use: the function-level `import os` makes `os` a
+        # local name, so reading it ahead of the import raises UnboundLocalError.
+        import os
+
+        # In test mode, return the original image path without uploading
+        if os.getenv('MINERU_TEST_FILE'):
+            logger.info(f"Gbase: Test mode - returning original path: {file_path}")
+            return file_path
+
        import os
        
        # Determine the file type
--- a/apps/common/handle/impl/mineru/image_optimizer.py
+++ b/apps/common/handle/impl/mineru/image_optimizer.py
@ -224,10 +224,10 @@ class ImageOptimizer:
                                   upload_options,
                                   max_retries: int = 3,
                                   retry_delay: float = 1.0) -> Tuple[Optional[str], Optional[str]]:
-        if os.getenv('MINERU_TEST_FILE'):
-            return image_info.filepath, None
-        # return image_info.filepath, None
        """上传单个图片（带并发控制和重试机制）"""
+        # 注释掉测试模式，让上传回调能够被调用
+        # if os.getenv('MINERU_TEST_FILE'):
+        #     return image_info.filepath, None
        async with self.upload_semaphore:
            # 处理图片
            image_data, hash_value = await self.process_image_for_upload(image_info)
@ -344,8 +344,9 @@ class ImageOptimizer:
        async with self.api_semaphore:
            try:
                # 调用分类函数
+                # classify_func expects: (learn_type, image_filepath, temp_dir, src_name, hint)
                return await classify_func(
-                    vision_model,
+                    vision_model,  # This is actually learn_type
                    img_data['image_info'].filepath,
                    temp_dir,
                    src_name,
--- a/apps/common/handle/impl/mineru/image_processor.py
+++ b/apps/common/handle/impl/mineru/image_processor.py
@ -517,11 +517,16 @@ class MinerUImageProcessor:
        
        This is an enhanced version that uses context when available.
        """
+        self.logger.info(f"mineru-image: _classify_single_image_with_context called for {os.path.basename(image_filepath)}")
+        
        # If no context, fall back to original method
        if not context:
+            self.logger.info(f"mineru-image: no context, falling back to original method")
            return await self._classify_single_image(learn_type, image_filepath, temp_dir, src_name, hint)
        
        try:
+            self.logger.info(f"mineru-image: processing with context for {os.path.basename(image_filepath)}")
+            
            if not os.path.exists(image_filepath):
                raise FileNotFoundError(f"Image file not found: {image_filepath}")
            
@ -535,6 +540,9 @@ class MinerUImageProcessor:
            # Build context-aware prompt with language
            prompt = self._build_context_aware_prompt(context, language_code)
            
+            # Log the final prompt for debugging
+            self.logger.info(f"mineru-image: Final prompt for {os.path.basename(image_filepath)}:\n{prompt[:1000]}...")
+            
            messages = [
                {'role': 'system', 'content': prompt},
                {'role': 'user', 'content': [
@ -549,19 +557,34 @@ class MinerUImageProcessor:
            start_time = time.time()
            
            try:
+                self.logger.info(f"mineru-image: calling vision model for {os.path.basename(image_filepath)}")
                response = await self.config.call_litellm(
                    model_type=learn_type,
                    messages=messages,
                    temperature=0.0,
                    timeout=120.0  # Increased timeout to 120 seconds for vision models
                )
+                self.logger.info(f"mineru-image: received response from vision model")
                
                duration = time.time() - start_time
                
+                # Log raw response for debugging
+                raw_response = response.choices[0].message.content if response.choices else ""
+                self.logger.info(f"mineru-image: raw AI response (first 500 chars): {raw_response[:500] if raw_response else 'Empty response'}")
+                # Log complete response for debugging
+                self.logger.info(f"mineru-image: FULL AI response for {os.path.basename(image_filepath)}:\n{raw_response}")
+                
+                # Log usage info
+                if hasattr(response, 'usage'):
+                    self.logger.info(f"mineru-image: usage - prompt_tokens={getattr(response.usage, 'prompt_tokens', 0)}, "
+                                   f"completion_tokens={getattr(response.usage, 'completion_tokens', 0)}")
+                else:
+                    self.logger.warning(f"mineru-image: no usage info in response")
+                
                # Parse enhanced response
                result = self._parse_context_aware_response(
-                    response.choices[0].message.content,
-                    response.usage,
+                    raw_response,
+                    response.usage if hasattr(response, 'usage') else None,
                    duration
                )
                
@ -569,15 +592,21 @@ class MinerUImageProcessor:
                result['has_context'] = True
                result['page_idx'] = context.page_idx
                
+                # Log successful classification
+                self.logger.info(f"mineru-image: classified {os.path.basename(image_filepath)} as {result.get('type', 'unknown')} "
+                                f"(tokens: in={result.get('input_tokens', 0)}, out={result.get('output_tokens', 0)})")
+                
            except Exception as e:
                self.logger.error(f"mineru-image: classification error: {str(e)}")
+                self.logger.info(f"mineru-image: classification failed for {os.path.basename(image_filepath)}, returning meaningless")
                result = {
                    'type': 'meaningless',
                    'content': f'Classification error: {str(e)}',
                    'input_tokens': 0,
                    'output_tokens': 0,
                    'dura': time.time() - start_time,
-                    'has_context': True
+                    'has_context': True,
+                    'error': str(e)
                }
            
            return result
@ -652,13 +681,16 @@ class MinerUImageProcessor:
            # Parse JSON
            result_json = json.loads(response_content)
            
+            # Log the raw classification response for debugging
+            self.logger.info(f"mineru-image: parsed JSON response: {result_json}")
+            
            # Build result dictionary
            result = {
                'type': result_json.get('type', 'meaningless'),
                'title': result_json.get('title', ''),
                'content': result_json.get('description', ''),
-                'input_tokens': usage.prompt_tokens if hasattr(usage, 'prompt_tokens') else 0,
-                'output_tokens': usage.completion_tokens if hasattr(usage, 'completion_tokens') else 0,
+                'input_tokens': usage.prompt_tokens if usage and hasattr(usage, 'prompt_tokens') else 0,
+                'output_tokens': usage.completion_tokens if usage and hasattr(usage, 'completion_tokens') else 0,
                'dura': duration
            }
            
@ -670,13 +702,14 @@ class MinerUImageProcessor:
            
        except Exception as e:
            self.logger.error(f"mineru-image: failed to parse context response: {str(e)}")
+            self.logger.debug(f"mineru-image: response that failed to parse: {response_content[:500] if response_content else 'Empty'}")
            # Return a basic result
            return {
                'type': 'brief_description',
                'title': '',
                'content': response_content[:200] if response_content else '',
-                'input_tokens': usage.prompt_tokens if hasattr(usage, 'prompt_tokens') else 0,
-                'output_tokens': usage.completion_tokens if hasattr(usage, 'completion_tokens') else 0,
+                'input_tokens': usage.prompt_tokens if usage and hasattr(usage, 'prompt_tokens') else 0,
+                'output_tokens': usage.completion_tokens if usage and hasattr(usage, 'completion_tokens') else 0,
                'dura': duration
            }
    
@ -736,6 +769,9 @@ class MinerUImageProcessor:
                # Parse response
                response_content = response.choices[0].message.content
                
+                # Log complete response for debugging
+                self.logger.info(f"mineru-image: FULL AI response for {os.path.basename(image_filepath)} (no context):\n{response_content}")
+                
                # Extract JSON from markdown code block if present
                if '```json' in response_content and '```' in response_content:
                    try:
@ -797,7 +833,8 @@ class MinerUImageProcessor:
                    'dura': time.time() - start_time,
                }
            
-            self.logger.info(f"mineru-image: classified {image_filepath} as {result.get('type', 'unknown')}")
+            # Enhanced logging to debug meaningless classification
+            self.logger.info(f"mineru-image: classified {image_filepath} as {result.get('type', 'unknown')} - tokens: in={result.get('input_tokens', 0)}, out={result.get('output_tokens', 0)}, error={result.get('error', 'None')}")
            
            return result
            
--- a/apps/common/handle/impl/mineru/logger.py
+++ b/apps/common/handle/impl/mineru/logger.py
@ -83,7 +83,10 @@ def get_module_logger(module_name):
            pass
    
    # 默认情况，创建独立的logger
-    return logging.getLogger(module_name)
+    module_logger = logging.getLogger(module_name)
+    # 确保使用正确的日志级别
+    module_logger.setLevel(logging.INFO)
+    return module_logger

 # 为了兼容性，导出默认logger
 logger = get_module_logger('mineru')
--- a/apps/common/handle/impl/mineru/maxkb_adapter/adapter.py
+++ b/apps/common/handle/impl/mineru/maxkb_adapter/adapter.py
@ -30,6 +30,11 @@ class MaxKBAdapter(PlatformAdapter):
        self.file_storage = FileStorageClient()
        self.model_client = maxkb_model_client
        
+        # 导入配置以获取存储路径
+        from .config_maxkb import MaxKBMinerUConfig
+        self.config = MaxKBMinerUConfig()
+        self.storage_path = self.config.file_storage_path
+    
    @contextlib.asynccontextmanager
    async def trace_context(self, trace_id: str):
        """MaxKB的trace上下文 - 如果没有特殊实现，使用简单的上下文"""
@ -51,19 +56,68 @@ class MaxKBAdapter(PlatformAdapter):
        logger.debug(f"MaxKB: Released lock for {temp_dir}")
    
    async def upload_file(self, file_path: str, options: Any = None) -> str:
-        """使用MaxKB的文件存储上传文件"""
+        """Copy a file into MaxKB's local storage directory and return its URL.
+
+        The file is copied to ``<storage_path>/<sub_dir>/images/<uuid><ext>``.
+        ``sub_dir`` is the knowledge id taken from ``options[0]`` when
+        ``options`` is a non-empty tuple/list, otherwise ``'mineru'``.
+
+        Args:
+            file_path: Path of the local file to store.
+            options: Optional tuple/list whose first item is the knowledge id.
+
+        Returns:
+            ``/storage/<relative path>``, prefixed with ``MAXKB_BASE_URL`` when
+            that environment variable is set. ``file_path`` is returned
+            unchanged when the file is missing or the copy fails.
+        """
+        import shutil
+        import uuid
+        
+        logger.info(f"MaxKB: upload_file called with path={file_path}, options={options}")
+        
        # Test-mode shortcut (return the original image path) - currently disabled
-        if os.getenv('MINERU_TEST_FILE'):
-            logger.info(f"MaxKB: Test mode - returning original path: {file_path}")
-            return file_path
+        #if os.getenv('MINERU_TEST_FILE'):
+        #    logger.info(f"MaxKB: Test mode - returning original path: {file_path}")
+        #    return file_path
        
        try:
-            # 使用MaxKB的文件存储客户端上传
-            uploaded_url = await self.file_storage.upload_image(file_path)
-            return uploaded_url
+            # Nothing to copy if the source file is missing
+            if not os.path.exists(file_path):
+                logger.warning(f"MaxKB: File not found: {file_path}")
+                return file_path
+            
+            # Take the knowledge id from options when one is supplied
+            knowledge_id = None
+            if isinstance(options, (tuple, list)) and options:
+                knowledge_id = options[0]
+            
+            # Coerce to str so a non-string id (e.g. uuid.UUID) does not make
+            # os.path.join raise TypeError. Fall back to 'mineru' when the id is
+            # empty or is not a single plain path component, so the copy can
+            # never land outside the storage root.
+            sub_dir = str(knowledge_id) if knowledge_id else 'mineru'
+            if sub_dir in ('.', '..') or '/' in sub_dir or '\\' in sub_dir:
+                logger.warning(f"MaxKB: Unsafe knowledge id {sub_dir!r}, using 'mineru' directory")
+                sub_dir = 'mineru'
+            storage_dir = os.path.join(self.storage_path, sub_dir, 'images')
+            
+            # Make sure the storage directory exists
+            os.makedirs(storage_dir, exist_ok=True)
+            
+            # Generate a unique file name, keeping the original extension
+            file_ext = os.path.splitext(file_path)[1]
+            file_name = f"{uuid.uuid4().hex}{file_ext}"
+            dest_path = os.path.join(storage_dir, file_name)
+            
+            # Copy the file into the storage directory
+            shutil.copy2(file_path, dest_path)
+            
+            # Build the path relative to the storage root, with forward
+            # slashes so it is usable as a URL on every platform
+            relative_path = os.path.relpath(dest_path, self.storage_path)
+            relative_path = relative_path.replace(os.path.sep, '/')
+            
+            # Use an absolute URL when MAXKB_BASE_URL is configured,
+            # otherwise a relative URL under /storage/
+            base_url = os.getenv('MAXKB_BASE_URL', '')
+            if base_url:
+                result_url = f"{base_url.rstrip('/')}/storage/{relative_path}"
+            else:
+                result_url = f"/storage/{relative_path}"
+            
+            logger.info(f"MaxKB: Copied file {file_path} -> {dest_path}")
+            logger.debug(f"MaxKB: Returning URL: {result_url}")
+            
+            return result_url
+            
        except Exception as e:
-            logger.error(f"MaxKB: Failed to upload file {file_path}: {str(e)}")
-            # 如果上传失败，返回本地路径
+            logger.error(f"MaxKB: Failed to copy file {file_path}: {str(e)}")
+            # Fall back to the local path when the copy fails
            return file_path
    
    def get_logger(self):
--- a/apps/common/handle/impl/mineru/maxkb_adapter/config_maxkb.py
+++ b/apps/common/handle/impl/mineru/maxkb_adapter/config_maxkb.py
@ -28,7 +28,7 @@ class MaxKBMinerUConfig(MinerUConfig):
        
        # File storage settings
        self.file_storage_type = os.getenv('MAXKB_STORAGE_TYPE', 'local')  # local, s3, oss
-        self.file_storage_path = os.getenv('MAXKB_STORAGE_PATH', '/tmp/maxkb/storage')
+        self.file_storage_path = os.getenv('MAXKB_STORAGE_PATH', '/opt/maxkb/storage')
        self.file_storage_bucket = os.getenv('MAXKB_STORAGE_BUCKET')
        
        # Model client settings
@ -133,14 +133,23 @@ class MaxKBMinerUConfig(MinerUConfig):
            
            # Call appropriate method based on content type
            if has_images:
-                # Extract image and text for vision model
+                # Extract image and combine all text content for vision model
                image_path = None
-                prompt = ""
+                combined_prompt = ""
+                
+                # First, collect system message if exists
                for msg in messages:
+                    if msg.get('role') == 'system':
+                        combined_prompt = msg.get('content', '') + "\n\n"
+                        break
+                
+                # Then extract user message content
+                for msg in messages:
+                    if msg.get('role') == 'user':
                        if isinstance(msg.get('content'), list):
                            for content_item in msg['content']:
                                if content_item.get('type') == 'text':
-                                prompt = content_item.get('text', '')
+                                    combined_prompt += content_item.get('text', '')
                                elif content_item.get('type') == 'image_url':
                                    image_url = content_item.get('image_url', {})
                                    if isinstance(image_url, dict):
@ -158,12 +167,14 @@ class MaxKBMinerUConfig(MinerUConfig):
                                                image_path = tmp.name
                                        else:
                                            image_path = url
+                        elif isinstance(msg.get('content'), str):
+                            combined_prompt += msg.get('content', '')
                
                if image_path:
                    response_text = await maxkb_model_client.vision_completion(
                        model_id=model_id,
                        image_path=image_path,
-                        prompt=prompt,
+                        prompt=combined_prompt,
                        **kwargs
                    )
                else:
@ -215,8 +226,14 @@ class MaxKBMinerUConfig(MinerUConfig):
                        'total_tokens': 0
                    })()
            
-            # Return empty response on error to continue processing
-            return MockResponse("")
+            # Return a valid JSON response on error to prevent parsing issues
+            # This will be parsed as a brief_description type
+            error_response = json.dumps({
+                "type": "brief_description",
+                "title": "Error",
+                "description": f"Model call failed: {str(e)}"
+            })
+            return MockResponse(error_response)
    
    def _get_default_llm_model_id(self) -> str:
        """获取默认的LLM模型ID"""
--- a/apps/common/handle/impl/mineru/maxkb_adapter/maxkb_model_client.py
+++ b/apps/common/handle/impl/mineru/maxkb_adapter/maxkb_model_client.py
@ -187,8 +187,13 @@ class MaxKBModelClient:
            llm_model = await self.get_llm_model(model_id)
            
            if not llm_model:
-                self.logger.warning(f"No model available for {model_id}, returning empty response")
-                return ""
+                self.logger.warning(f"No model available for {model_id}, returning error JSON")
+                import json
+                return json.dumps({
+                    "type": "brief_description",
+                    "title": "No Model",
+                    "description": "LLM model not available"
+                })
            
            # 调用模型 - 使用 sync_to_async 包装同步调用
            response = await sync_to_async(llm_model.invoke)(messages)
@ -203,8 +208,13 @@ class MaxKBModelClient:
                
        except Exception as e:
            self.logger.error(f"Chat completion failed: {str(e)}")
-            # 返回空字符串而不是抛出异常，让处理继续
-            return ""
+            # 返回错误JSON而不是空字符串
+            import json
+            return json.dumps({
+                "type": "brief_description",
+                "title": "Error",
+                "description": f"Chat completion failed: {str(e)}"
+            })
    
    async def vision_completion(self, model_id: str, image_path: str, prompt: str, **kwargs) -> str:
        """
@ -224,16 +234,48 @@ class MaxKBModelClient:
            vision_model = await self.get_vision_model(model_id)
            
            if not vision_model:
-                self.logger.warning(f"No vision model available for {model_id}, returning empty response")
-                return ""
+                self.logger.warning(f"No vision model available for {model_id}, returning error JSON")
+                # Return a valid JSON response instead of empty string
+                import json
+                return json.dumps({
+                    "type": "brief_description",
+                    "title": "No Model",
+                    "description": "Vision model not available"
+                })
            
-            # 构造消息
+            # 读取图片并转换为base64
+            import base64
+            import os
+            
+            if not os.path.exists(image_path):
+                self.logger.error(f"Image file not found: {image_path}")
+                import json
+                return json.dumps({
+                    "type": "brief_description",
+                    "title": "File Error",
+                    "description": f"Image file not found: {image_path}"
+                })
+            
+            try:
+                with open(image_path, 'rb') as img_file:
+                    image_data = img_file.read()
+                    image_base64 = base64.b64encode(image_data).decode('utf-8')
+            except Exception as e:
+                self.logger.error(f"Failed to read/encode image {image_path}: {str(e)}")
+                import json
+                return json.dumps({
+                    "type": "brief_description",
+                    "title": "Image Error",
+                    "description": f"Failed to read/encode image: {str(e)}"
+                })
+            
+            # 构造消息 - 使用base64编码的图片
            messages = [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
-                        {"type": "image_url", "image_url": {"url": f"file://{image_path}"}}
+                        {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"}}
                    ]
                }
            ]
@ -251,8 +293,13 @@ class MaxKBModelClient:
                
        except Exception as e:
            self.logger.error(f"Vision completion failed: {str(e)}")
-            # 返回空字符串而不是抛出异常，让处理继续
-            return ""
+            # 返回错误JSON而不是空字符串
+            import json
+            return json.dumps({
+                "type": "brief_description",
+                "title": "Vision Error",
+                "description": f"Vision completion failed: {str(e)}"
+            })
    
    async def batch_chat_completion(self, model_id: str, batch_messages: List[List[Dict]], **kwargs) -> List[str]:
        """
--- a/apps/common/handle/impl/mineru/parallel_processor.py
+++ b/apps/common/handle/impl/mineru/parallel_processor.py
@ -610,6 +610,16 @@ class ParallelMinerUProcessor:
                                xref = image_info.xref
                                if xref in classification_results:
                                    result = classification_results[xref]
+                                else:
+                                    # No classification result - likely an error occurred
+                                    self.logger.warning(f"Recognizer: no classification result for {image_info.filename}, creating default result")
+                                    result = {
+                                        'type': 'meaningless',
+                                        'content': 'Classification failed - no result returned',
+                                        'input_tokens': 0,
+                                        'output_tokens': 0,
+                                        'error': 'No classification result'
+                                    }
                                
                                # Apply meaningless filter if configured
                                if self.config.filter_meaningless_images and result.get('type') == 'meaningless':
@ -977,25 +987,24 @@ class ParallelMinerUProcessor:
                            f"images_count={len(task.images)}")
            
            if has_content and (has_no_images or has_processed_images):
-                # Integrate image descriptions into content before marking complete
-                if task.processed_images and task.image_descriptions:
-                    self.logger.info(f"Page {task.page_idx + 1} ready for image integration:")
-                    self.logger.info(f"  - processed_images: {list(task.processed_images.keys())}")
+                # Integrate images into content if we have any image descriptions
+                # This ensures meaningless images are properly removed from content
+                if task.image_descriptions:
+                    self.logger.info(f"Page {task.page_idx + 1} processing image integration:")
+                    self.logger.info(f"  - processed_images: {list(task.processed_images.keys()) if task.processed_images else 'None (filtered out)'}")
                    self.logger.info(f"  - image_descriptions: {list(task.image_descriptions.keys())}")
                    self.logger.info(f"  - content length before: {len(task.refined_content)} chars")
                    
                    task.refined_content = self._integrate_images_into_content(
                        task.refined_content,
                        task.image_descriptions,
-                        task.processed_images,
+                        task.processed_images or {},  # Pass empty dict if None
                        f"{task.src_fileid}_page_{task.page_idx}"
                    )
                    
                    self.logger.info(f"  - content length after: {len(task.refined_content)} chars")
                else:
-                    self.logger.info(f"Page {task.page_idx + 1} has no images to integrate: "
-                                   f"processed_images={bool(task.processed_images)}, "
-                                   f"image_descriptions={bool(task.image_descriptions)}")
+                    self.logger.info(f"Page {task.page_idx + 1} has no images to process")
                
                task.status = TaskStatus.COMPLETED
                should_mark_complete = True
@ -1032,8 +1041,23 @@ class ParallelMinerUProcessor:
            # Process each image description
            for filename, desc_info in image_descriptions.items():
                self.logger.info(f"\nChecking image {filename} for replacement")
-                if filename in uploaded_images:
-                    uploaded_url = uploaded_images[filename]
+                
+                # Get image type to determine if it's meaningless
+                img_type = desc_info.get('type', 'brief_description')
+                
+                # Process ALL images that have descriptions
+                # - Meaningless images: remove references (replace with empty string)
+                # - Images not uploaded but classified: also remove (likely filtered)
+                # - Uploaded images: replace with proper markdown
+                uploaded_url = uploaded_images.get(filename, '')
+                
+                if img_type == 'meaningless':
+                    self.logger.info(f"  - Image is meaningless, will remove references")
+                elif filename not in uploaded_images:
+                    self.logger.info(f"  - Image was classified as {img_type} but not uploaded (filtered), will remove references")
+                    # Treat as meaningless for removal purposes
+                    img_type = 'meaningless'
+                else:
                    self.logger.info(f"  - Found in uploaded_images: {uploaded_url}")
                
                # Extract the hash part from the filename
@ -1057,15 +1081,15 @@ class ParallelMinerUProcessor:
                        ref = base_filename
                
                # Build replacement content based on image type
-                    img_type = desc_info.get('type', 'brief_description')
+                # img_type already extracted above
                title = desc_info.get('title', '')
                description = desc_info.get('content', '')
                ocr_content = desc_info.get('ocr_content', '')
                
                # Create the replacement markdown
                if img_type == 'meaningless':
-                        # Skip meaningless images
-                        continue
+                    # For meaningless images, we want to remove them entirely
+                    replacement = ""  # Empty string to remove the image reference
                elif img_type == 'structured_content':
                    # For structured content, include full description
                    replacement = f"\n\n![{title}]({uploaded_url})\n<!--{description}-->\n\n{ocr_content}\n\n"
@ -1120,8 +1144,6 @@ class ParallelMinerUProcessor:
                    self.logger.info(f"Tried patterns:")
                    for p in patterns:
                        self.logger.info(f"  - {p}")
-                else:
-                    self.logger.warning(f"Image {filename} not found in uploaded_images!")
                        
            return enhanced_content
            
--- a/apps/common/handle/impl/mineru/prompts/image_classification.py
+++ b/apps/common/handle/impl/mineru/prompts/image_classification.py
@ -56,18 +56,6 @@ IMAGE_CLASSIFICATION_CONTEXT_BASE = IMAGE_CLASSIFICATION_BASE + """
 - description字段请控制在100-200字以内，需要：
  - 解释图片与周围文本的关系
  - 说明图片在文档中的作用
- context_relevance 字段，表示图片与上下文的相关性（high/medium/low）
-
-# 输出格式：
-```json
-{
-   "type": "分类类型",
-   "title": "简短标题",
-   "description": "详细描述",
-   "ocr_content": "提取的文字内容（如适用）",
-   "context_relevance": "相关性等级"
-}
-```

 # 上下文信息：
 """
--- a/apps/maxkb/urls.py
+++ b/apps/maxkb/urls.py
@ -72,6 +72,65 @@ def pro():
 if not settings.DEBUG:
    pro()

+# Storage route: implemented as a plain function view to avoid class-import problems.
+def serve_storage_file(request, file_path):
+    """
+    Serve a file located under the storage directory.
+
+    Args:
+        request: The incoming HTTP request (not inspected).
+        file_path: Path captured from the URL, interpreted relative to the
+            storage root (e.g. ``mineru/images/xxx.jpg``).
+
+    Returns:
+        An ``HttpResponse`` carrying the file bytes. Images are sent
+        ``inline`` with a 30-day cache lifetime; everything else is sent as
+        an ``attachment`` with a 1-day cache lifetime.
+
+    Raises:
+        Http404: If the resolved path escapes the storage root, is not a
+            regular file, or cannot be read.
+    """
+    import os
+    import mimetypes
+    from django.http import HttpResponse, Http404
+    from django.utils.encoding import escape_uri_path
+
+    # Storage root: taken from MAXKB_STORAGE_PATH, defaulting to the Docker location.
+    base_path = os.getenv('MAXKB_STORAGE_PATH', '/opt/maxkb/storage')
+    # When that directory does not exist, fall back to the relative local-development path.
+    if not os.path.exists(base_path):
+        base_path = './tmp/maxkb/storage'
+    full_path = os.path.join(base_path, file_path)
+
+    # Containment check. A plain string-prefix test would also accept sibling
+    # directories such as "<root>_evil/...", so compare whole path components.
+    try:
+        real_base = os.path.realpath(base_path)
+        real_path = os.path.realpath(full_path)
+        inside_base = os.path.commonpath([real_base, real_path]) == real_base
+    except (OSError, ValueError):
+        raise Http404("File not found")
+    if not inside_base:
+        raise Http404("File not found")
+
+    # Only regular files are served (isfile is False for missing paths too).
+    if not os.path.isfile(real_path):
+        raise Http404("File not found")
+
+    # Read the path that was actually validated above.
+    try:
+        with open(real_path, 'rb') as f:
+            file_content = f.read()
+    except IOError:
+        raise Http404("File not found")
+
+    # Guess the MIME type from the requested name; default to a binary stream.
+    content_type, _ = mimetypes.guess_type(full_path)
+    if not content_type:
+        content_type = 'application/octet-stream'
+
+    response = HttpResponse(file_content, content_type=content_type)
+
+    # Response headers: images are displayed inline and cached longer.
+    file_name = os.path.basename(full_path)
+    if content_type.startswith('image/'):
+        response['Content-Disposition'] = f'inline; filename="{escape_uri_path(file_name)}"'
+        response['Cache-Control'] = 'public, max-age=2592000'  # 30 days
+    else:
+        response['Content-Disposition'] = f'attachment; filename="{escape_uri_path(file_name)}"'
+        response['Cache-Control'] = 'public, max-age=86400'  # 1 day
+
+    return response
+
+# Register the storage route by inserting it at the front of urlpatterns.
+urlpatterns.insert(0, re_path(r'^storage/(?P<file_path>.*)$', serve_storage_file, name='storage_file'))
+

 def get_index_html(index_path):
    file = open(index_path, "r", encoding='utf-8')
--- a/apps/oss/urls.py
+++ b/apps/oss/urls.py
@ -6,4 +6,5 @@ app_name = 'oss'

 urlpatterns = [
    path('oss/file', views.FileView.as_view()),
+    # storage路由已移至主URL配置中
 ]
--- a/apps/oss/views/init.py
+++ b/apps/oss/views/init.py
@ -1 +1,2 @@
 from .file import *
+from .storage import StorageFileView
--- a/apps/oss/views/storage.py
+++ b/apps/oss/views/storage.py
@ -0,0 +1,87 @@
+# coding=utf-8
+"""
+Storage file service for MinerU parsed images
+"""
+import os
+import mimetypes
+from pathlib import Path
+
+from django.http import HttpResponse, Http404
+from django.utils.encoding import escape_uri_path
+from django.views import View
+
+
+class StorageFileView(View):
+    """
+    Static-file view that serves the images produced by MinerU parsing.
+
+    It derives from Django's basic ``View`` class, so the authentication
+    system is not involved at all.
+    """
+
+    def get(self, request, file_path: str):
+        """
+        Return a stored file.
+
+        Args:
+            request: The HTTP request (not inspected).
+            file_path: File path relative to the storage root
+                (e.g. ``mineru/images/xxx.jpg``).
+
+        Returns:
+            An ``HttpResponse`` with the file content. Images are sent
+            ``inline`` and cached for 30 days; other files are sent as an
+            ``attachment`` and cached for 1 day.
+
+        Raises:
+            Http404: If the resolved path lies outside the storage root, is
+                not a regular file, or cannot be read.
+        """
+        # Storage root comes from the environment (default: /opt/maxkb/storage).
+        base_path = os.getenv('MAXKB_STORAGE_PATH', '/opt/maxkb/storage')
+        full_path = os.path.join(base_path, file_path)
+
+        # Path-traversal guard: resolve symlinks and ".." first, then compare
+        # whole path components. A string-prefix test would wrongly accept
+        # sibling directories such as "<root>_evil/...".
+        try:
+            real_base = os.path.realpath(base_path)
+            real_path = os.path.realpath(full_path)
+            inside_base = os.path.commonpath([real_base, real_path]) == real_base
+        except (OSError, ValueError):
+            raise Http404("File not found")
+        if not inside_base:
+            raise Http404("File not found")
+
+        # Only regular files are served (isfile is False for missing paths too).
+        if not os.path.isfile(real_path):
+            raise Http404("File not found")
+
+        # Read the path that was actually validated above.
+        try:
+            with open(real_path, 'rb') as f:
+                file_content = f.read()
+        except IOError:
+            raise Http404("File not found")
+
+        # Guess the MIME type from the requested name; default to a binary stream.
+        content_type, _ = mimetypes.guess_type(full_path)
+        if not content_type:
+            content_type = 'application/octet-stream'
+
+        response = HttpResponse(file_content, content_type=content_type)
+
+        # Images are shown inline and cached for 30 days; anything else is
+        # offered as a download and cached for 1 day.
+        file_name = os.path.basename(full_path)
+        if content_type.startswith('image/'):
+            disposition = 'inline'
+            cache_control = 'public, max-age=2592000'
+        else:
+            disposition = 'attachment'
+            cache_control = 'public, max-age=86400'
+
+        # escape_uri_path handles special characters in the file name.
+        response['Content-Disposition'] = f'{disposition}; filename="{escape_uri_path(file_name)}"'
+        response['Cache-Control'] = cache_control
+
+        return response
--- a/dev/docker-compose-simple.yml
+++ b/dev/docker-compose-simple.yml
@ -15,12 +15,50 @@ services:
      - ../tmp:/tmp
      # 数据持久化
      - ~/.maxkb:/opt/maxkb
+      # MinerU图片存储目录持久化
+      - ~/.maxkb/storage:/opt/maxkb/storage:rw
    environment:
      # 开启调试模式
      DJANGO_DEBUG: "True"
      PYTHONUNBUFFERED: "1"
      MAXKB_LOG_LEVEL: "DEBUG"
+      
+      # MinerU 配置
+      MINERU_PARSER_CACHE: "True"
+      MINERU_MULTIMODAL_REFINEMENT: "True"
+      # MinerU 图片存储路径
+      MAXKB_STORAGE_PATH: "/opt/maxkb/storage"
+      
      MINERU_API_TYPE: "self_hosted"
      MINERU_API_URL: "http://mineru:8000"
+      
+      # MINERU_API_TYPE: "cloud"
+      # MINERU_API_URL: "https://mineru.net"
+      # MINERU_API_KEY: "<your-mineru-api-key>"  # never commit a real key; supply it via .env or a secret store
+      
+      # 配置队列大小
+      MINERU_QUEUE_SIZE: "50"
+      # 配置处理超时时间（秒）
+      MINERU_PROCESSING_TIMEOUT: "7200"
+      # 配置各线程数量（支持多线程）
+      MINERU_PARSER_THREADS: "1"
+      MINERU_REFINER_THREADS: "3"
+      MINERU_RECOGNIZER_THREADS: "3"
+      MINERU_UPLOADER_THREADS: "1"
+      
+      MINERU_BATCH_PROCESSING: "true"
+      MINERU_BATCH_SIZE: "10"
+      # 启用/禁用过滤
+      MINERU_SKIP_SMALL_IMAGES: "true"
+      # 每页最多图片数
+      MINERU_MAX_IMAGES_PER_PAGE: "10"
+      # 文档最多图片数
+      MINERU_MAX_IMAGES_PER_DOCUMENT: "200"
+      # 最小图片尺寸（像素）
+      MINERU_MIN_IMAGE_SIZE: "10000"
+      # 最大图片尺寸（像素）
+      MINERU_MAX_IMAGE_SIZE: "10000000"
+      # 过滤无意义图片
+      MINERU_FILTER_MEANINGLESS: "true"
    restart: unless-stopped
    # 使用镜像默认的启动命令
--- a/installer/Dockerfile
+++ b/installer/Dockerfile
@ -75,6 +75,11 @@ ENV MAXKB_VERSION="${DOCKER_IMAGE_TAG} (build at ${BUILD_AT}, commit: ${GITHUB_C
    PATH=/opt/py3/bin:$PATH \
    PIP_TARGET=/opt/maxkb/python-packages

+# Install poppler-utils for PDF processing (required by MinerU)
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends poppler-utils && \
+    apt-get clean all && \
+    rm -rf /var/lib/apt/lists/*

 WORKDIR /opt/maxkb-app
 COPY --from=stage-build /opt/maxkb-app /opt/maxkb-app
--- a/test_image_access.py
+++ b/test_image_access.py
@ -0,0 +1,59 @@
+#!/usr/bin/env python
+"""
+测试图片存储和访问
+
+这个脚本会：
+1. 创建一个测试图片在存储目录
+2. 打印正确的访问URL
+"""
+
+import os
+import sys
+
+def main():
+    """Create a placeholder image in the storage directory and print how to reach it."""
+    # Storage root; defaults to the local-development location.
+    storage_path = os.getenv('MAXKB_STORAGE_PATH', './tmp/maxkb/storage')
+    banner = "=" * 60
+    image_url = "http://localhost:8080/storage/mineru/images/ac3681aaa7a346b49ef9c7ceb7b94058.jpg"
+
+    print(banner)
+    print("MaxKB 图片存储和访问测试")
+    print(banner)
+
+    # Make sure the mineru/images directory exists.
+    image_dir = os.path.join(storage_path, 'mineru', 'images')
+    os.makedirs(image_dir, exist_ok=True)
+    print(f"\n1. 存储目录：{image_dir}")
+
+    # Write placeholder bytes (not real image data) under the fixed test name.
+    test_image = os.path.join(image_dir, 'ac3681aaa7a346b49ef9c7ceb7b94058.jpg')
+    with open(test_image, 'wb') as handle:
+        handle.write(b'TEST IMAGE CONTENT')
+    print(f"2. 创建测试文件：{test_image}")
+
+    # The same URL is printed for both environments.
+    print("\n3. 访问URL：")
+    print(f"   本地开发：{image_url}")
+    print(f"   Docker环境：{image_url}")
+
+    # Print the storage tree, indenting by directory depth.
+    print("\n4. 存储目录内容：")
+    for current_dir, _subdirs, file_names in os.walk(storage_path):
+        depth = current_dir.replace(storage_path, '').count(os.sep)
+        dir_indent = '  ' * depth
+        print(f'{dir_indent}{os.path.basename(current_dir)}/')
+        entry_indent = '  ' * (depth + 1)
+        for name in file_names:
+            size = os.path.getsize(os.path.join(current_dir, name))
+            print(f'{entry_indent}{name} ({size} bytes)')
+
+    print("\n" + banner)
+    closing_lines = (
+        "测试完成！",
+        "\n注意事项：",
+        "1. 确保Django服务器正在运行",
+        "2. URL路径现在是 /storage/ 开头，简洁直接",
+        "3. 如果使用Docker，确保volume正确挂载",
+    )
+    for line in closing_lines:
+        print(line)
+    print(banner)
+
+if __name__ == "__main__":
+    main()
--- a/test_storage.py
+++ b/test_storage.py
@ -0,0 +1,131 @@
+#!/usr/bin/env python
+"""
+测试MinerU图片存储和访问功能
+
+使用方法：
+1. 在本地开发环境：python test_storage.py
+2. 在Docker环境：docker exec -it maxkb-dev python /opt/maxkb-app/test_storage.py
+"""
+
+import os
+import sys
+import tempfile
+import shutil
+from pathlib import Path
+
+def test_storage():
+    """
+    Exercise the storage directory: create a test file and print its access URL.
+
+    The storage root is read from ``MAXKB_STORAGE_PATH`` (default
+    ``/opt/maxkb/storage``). The created file is left in place.
+
+    Returns:
+        True when the test file exists after being written, False otherwise.
+    """
+    print("=" * 60)
+    print("MinerU 图片存储测试")
+    print("=" * 60)
+
+    # 1. Report the configured storage path.
+    storage_path = os.getenv('MAXKB_STORAGE_PATH', '/opt/maxkb/storage')
+    print(f"\n1. 存储路径配置：{storage_path}")
+
+    # 2. Create the test directory structure.
+    test_dir = os.path.join(storage_path, 'test', 'images')
+    print(f"\n2. 创建测试目录：{test_dir}")
+    os.makedirs(test_dir, exist_ok=True)
+
+    # 3. Create the test file (plain text standing in for an image).
+    test_image_path = os.path.join(test_dir, 'test_image.txt')
+    print(f"\n3. 创建测试文件：{test_image_path}")
+    with open(test_image_path, 'w') as f:
+        f.write("This is a test image file for MinerU storage")
+
+    # 4. Verify that the file was created.
+    if os.path.exists(test_image_path):
+        print("   ✓ 文件创建成功")
+        file_size = os.path.getsize(test_image_path)
+        print(f"   文件大小：{file_size} bytes")
+    else:
+        print("   ✗ 文件创建失败")
+        return False
+
+    # 5. Build the access URL. Files are served under /storage/ (not
+    #    /api/storage/), and URLs always use forward slashes whatever the
+    #    platform's path separator is.
+    relative_path = os.path.relpath(test_image_path, storage_path).replace(os.sep, '/')
+    access_url = f"/storage/{relative_path}"
+    print(f"\n4. 生成的访问URL：{access_url}")
+
+    # 6. List the storage directory contents, indented by depth.
+    print("\n5. 存储目录内容：")
+    for root, dirs, files in os.walk(storage_path):
+        level = root.replace(storage_path, '').count(os.sep)
+        indent = ' ' * 2 * level
+        print(f'{indent}{os.path.basename(root)}/')
+        subindent = ' ' * 2 * (level + 1)
+        for file in files:
+            print(f'{subindent}{file}')
+
+    print("\n" + "=" * 60)
+    print("测试完成！")
+    print("\n配置建议：")
+    print("1. 确保Docker volume正确挂载：~/.maxkb/storage:/opt/maxkb/storage")
+    print("2. 确保环境变量设置：MAXKB_STORAGE_PATH=/opt/maxkb/storage")
+    print("3. 访问图片URL格式：http://localhost:8080/storage/mineru/images/xxx.jpg")
+    print("=" * 60)
+
+    return True
+
+def test_mineru_adapter():
+    """
+    Smoke-test ``MaxKBAdapter.upload_file`` with a throwaway temporary file.
+
+    The ``apps`` directory is put on ``sys.path`` so the adapter can be
+    imported. Import failures and any other error are reported on stdout
+    rather than raised. The temporary file is removed even when the upload
+    fails.
+    """
+    print("\n" + "=" * 60)
+    print("测试MinerU适配器")
+    print("=" * 60)
+
+    # Add the apps directory to the Python path (container path if present).
+    sys.path.insert(0, '/opt/maxkb-app/apps' if os.path.exists('/opt/maxkb-app/apps') else './apps')
+
+    try:
+        from common.handle.impl.mineru.maxkb_adapter.adapter import MaxKBAdapter
+        import asyncio
+
+        print("\n1. 创建MaxKB适配器实例")
+        adapter = MaxKBAdapter()
+        print(f"   存储路径：{adapter.storage_path}")
+
+        # Create a temporary source file; delete=False keeps it on disk
+        # after the handle is closed so the adapter can read it.
+        with tempfile.NamedTemporaryFile(suffix='.jpg', delete=False) as tmp:
+            tmp.write(b"Test image content")
+            tmp_path = tmp.name
+
+        try:
+            print("\n2. 测试upload_file方法")
+            print(f"   源文件：{tmp_path}")
+
+            # upload_file is awaited, so wrap it in a coroutine and let
+            # asyncio.run manage the event loop.
+            async def test_upload():
+                return await adapter.upload_file(tmp_path, options=['test_knowledge'])
+
+            result_url = asyncio.run(test_upload())
+            print(f"   返回URL：{result_url}")
+        finally:
+            # Always clean up the temporary file, even if the upload raised.
+            os.unlink(tmp_path)
+
+        print("\n✓ MinerU适配器测试成功")
+
+    except ImportError as e:
+        print(f"\n✗ 无法导入MinerU适配器：{e}")
+        print("  请确保在MaxKB环境中运行此测试")
+    except Exception as e:
+        print(f"\n✗ 测试失败：{e}")
+        import traceback
+        traceback.print_exc()
+
+if __name__ == "__main__":
+    # Run the basic storage test first.
+    if test_storage():
+        # Only when it succeeds, try the adapter test as well.
+        try:
+            test_mineru_adapter()
+        except Exception:
+            # Catch Exception rather than everything, so Ctrl-C and
+            # SystemExit still terminate the script.
+            print("\n提示：适配器测试需要在MaxKB环境中运行")
--- a/test_storage_simple.py
+++ b/test_storage_simple.py
@ -0,0 +1,22 @@
+#!/usr/bin/env python
+"""
+简单的存储测试 - 创建测试图片
+"""
+import os
+
+# Make sure the local-development image directory exists.
+image_dir = './tmp/maxkb/storage/mineru/images'
+os.makedirs(image_dir, exist_ok=True)
+
+# Hex dump of a JPEG byte stream: it starts with the FF D8 FF E0 magic
+# number, so browsers treat the file as an image.
+jpeg_bytes = bytes.fromhex('FFD8FFE000104A46494600010101006000600000FFDB004300080606070605080707070909080A0C140D0C0B0B0C1912130F141D1A1F1E1D1A1C1C20242E2720222C231C1C2837292C30313434341F27393D38323C2E333432FFDB0043010909090C0B0C180D0D1832211C2132323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232FFC00011080001000103012200021101031101FFC4001F0000010501010101010100000000000000000102030405060708090A0BFFC400B5100002010303020403050504040000017D01020300041105122131410613516107227114328191A1082342B1C11552D1F02433627282090A161718191A25262728292A3435363738393A434445464748494A535455565758595A636465666768696A737475767778797A838485868788898A92939495969798999AA2A3A4A5A6A7A8A9AAB2B3B4B5B6B7B8B9BAC2C3C4C5C6C7C8C9CAD2D3D4D5D6D7D8D9DAE1E2E3E4E5E6E7E8E9EAF1F2F3F4F5F6F7F8F9FAFFC4001F0100030101010101010101010000000000000102030405060708090A0BFFC400B51100020102040403040705040400010277000102031104052131061241510761711322328108144291A1B1C109233352F0156272D10A162434E125F11718191A262728292A35363738393A434445464748494A535455565758595A636465666768696A737475767778797A82838485868788898A92939495969798999AA2A3A4A5A6A7A8A9AAB2B3B4B5B6B7B8B9BAC2C3C4C5C6C7C8C9CAD2D3D4D5D6D7D8D9DAE2E3E4E5E6E7E8E9EAF2F3F4F5F6F7F8F9FAFFDA000C03010002110311003F00F9FFD9')
+
+# Write the test image under the fixed name used by the access URL below.
+image_file = os.path.join(image_dir, 'ac3681aaa7a346b49ef9c7ceb7b94058.jpg')
+with open(image_file, 'wb') as out:
+    out.write(jpeg_bytes)
+
+# Report what was written and where it can be fetched from.
+print(f"测试文件已创建：{image_file}")
+print(f"文件大小：{os.path.getsize(image_file)} bytes")
+print("\n访问URL：")
+print("http://localhost:8080/storage/mineru/images/ac3681aaa7a346b49ef9c7ceb7b94058.jpg")
+print("\n如果Django服务正在运行，可以直接在浏览器中访问上述URL")