From f0263bf189fe7a98467d02a259bc26c6c62ff7c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9C=B1=E6=BD=AE?= Date: Sun, 24 Aug 2025 17:45:40 +0800 Subject: [PATCH] add mineru --- MINERU_STORAGE_README.md | 61 +++++ apps/common/handle/impl/mineru/base_parser.py | 29 ++- .../impl/mineru/gbase_adapter/adapter.py | 7 +- .../handle/impl/mineru/image_optimizer.py | 9 +- .../handle/impl/mineru/image_processor.py | 53 +++- apps/common/handle/impl/mineru/logger.py | 5 +- .../impl/mineru/maxkb_adapter/adapter.py | 74 +++++- .../impl/mineru/maxkb_adapter/config_maxkb.py | 71 ++++-- .../maxkb_adapter/maxkb_model_client.py | 67 ++++- .../handle/impl/mineru/parallel_processor.py | 236 ++++++++++-------- .../mineru/prompts/image_classification.py | 12 - apps/maxkb/urls.py | 59 +++++ apps/oss/urls.py | 1 + apps/oss/views/__init__.py | 3 +- apps/oss/views/storage.py | 87 +++++++ dev/docker-compose-simple.yml | 38 +++ installer/Dockerfile | 5 + test_image_access.py | 59 +++++ test_storage.py | 131 ++++++++++ test_storage_simple.py | 22 ++ 20 files changed, 845 insertions(+), 184 deletions(-) create mode 100644 MINERU_STORAGE_README.md create mode 100644 apps/oss/views/storage.py create mode 100644 test_image_access.py create mode 100644 test_storage.py create mode 100644 test_storage_simple.py diff --git a/MINERU_STORAGE_README.md b/MINERU_STORAGE_README.md new file mode 100644 index 00000000..a7ea921a --- /dev/null +++ b/MINERU_STORAGE_README.md @@ -0,0 +1,61 @@ +# MinerU 图片存储配置说明 + +## 访问URL格式 + +MinerU解析后的图片访问URL格式为: +``` +http://localhost:8080/storage/mineru/images/xxx.jpg +``` + +简洁直接,直接使用 `/storage/` 路径访问 + +## 存储路径配置 + +### 本地开发环境 +- 存储路径:`./tmp/maxkb/storage/` +- 图片位置:`./tmp/maxkb/storage/mineru/images/` + +### Docker环境 +- 容器内路径:`/opt/maxkb/storage/` +- 本地映射路径:`~/.maxkb/storage/` +- 图片位置:`~/.maxkb/storage/mineru/images/` + +## 环境变量配置 + +在 `.env` 文件或 docker-compose.yml 中添加: +```bash +MAXKB_STORAGE_PATH=/opt/maxkb/storage +``` + +## Docker Volume配置 + +在 `docker-compose.yml` 中已配置: +```yaml +volumes: + - ~/.maxkb/storage:/opt/maxkb/storage:rw +``` + +## 测试访问 + +1. 运行测试脚本创建测试图片: +```bash +python test_image_access.py +``` + +2. 访问测试URL: +``` +http://localhost:8080/storage/mineru/images/ac3681aaa7a346b49ef9c7ceb7b94058.jpg +``` + +## 故障排查 + +1. **404错误**:检查文件是否存在于存储目录 +2. **权限错误**:确保存储目录有写入权限 +3. **路径错误**:确认URL路径以 `/storage/` 开头 + +## 相关文件 + +- 存储视图:`apps/oss/views/storage.py` +- URL配置:`apps/oss/urls.py` +- MinerU适配器:`apps/common/handle/impl/mineru/maxkb_adapter/adapter.py` +- 配置文件:`apps/common/handle/impl/mineru/maxkb_adapter/config_maxkb.py` \ No newline at end of file diff --git a/apps/common/handle/impl/mineru/base_parser.py b/apps/common/handle/impl/mineru/base_parser.py index 994ba25d..d70e3c9a 100644 --- a/apps/common/handle/impl/mineru/base_parser.py +++ b/apps/common/handle/impl/mineru/base_parser.py @@ -268,12 +268,35 @@ class BaseMinerUExtractor: self.logger.info(f"mineru-parser: found {len(cached_pages)} cached pages") # 获取上传回调(通过适配器) - upload_callback = lambda file_path: self.adapter.upload_file(file_path, upload_options) + # 总是创建upload_callback,让适配器决定如何处理 + # upload_func 需要接受4个参数: filepath, filename, upload_options, binary_data + async def upload_callback(filepath, filename, options, binary_data=None): + # 如果有 binary_data,说明图片被压缩了,需要先保存到临时文件 + if binary_data: + import tempfile + with tempfile.NamedTemporaryFile(suffix=os.path.splitext(filename)[1], delete=False) as tmp: + tmp.write(binary_data) + tmp_path = tmp.name + + result = await self.adapter.upload_file(tmp_path, upload_options) + # 清理临时文件 + try: + os.unlink(tmp_path) + except: + pass + elif filepath: + result = await self.adapter.upload_file(filepath, upload_options) + else: + self.logger.warning(f"No file path or binary data provided for {filename}") + return None, None + + # 返回 (url, upload_key) 格式 + return result, None # 并行处理文档 completed_tasks = await self.parallel_processor.process_document_with_cache( pdf_path, temp_dir, src_fileid, is_ppt_format, - len(pages_info), upload_callback if upload_options else None, upload_options, + len(pages_info), upload_callback, upload_options, cached_pages=cached_pages, save_callback=lambda idx, data: self._save_page_cache(temp_dir, idx, data) ) @@ -724,4 +747,4 @@ class BaseMinerUExtractor: except Exception as e: self.logger.error(f"mineru-parser: failed to load cached pages: {str(e)}") - return cached_pages \ No newline at end of file + return cached_pages diff --git a/apps/common/handle/impl/mineru/gbase_adapter/adapter.py b/apps/common/handle/impl/mineru/gbase_adapter/adapter.py index 62f6001f..e3e0d8fd 100644 --- a/apps/common/handle/impl/mineru/gbase_adapter/adapter.py +++ b/apps/common/handle/impl/mineru/gbase_adapter/adapter.py @@ -45,6 +45,11 @@ class GPTBaseAdapter(PlatformAdapter): async def upload_file(self, file_path: str, options: Any = None) -> str: """上传文件 - PDF使用S3特殊处理,图片使用gzero上传""" + # 如果在测试模式下,直接返回原图地址 + if os.getenv('MINERU_TEST_FILE'): + logger.info(f"Gbase: Test mode - returning original path: {file_path}") + return file_path + import os # 判断文件类型 @@ -103,4 +108,4 @@ class MinerUExtractor(BaseMinerUExtractor): super().__init__(adapter, config=config, learn_type=learn_type) # 保存learn_type供后续使用(基类已经保存了,这里是为了兼容性) - self.learn_type = learn_type \ No newline at end of file + self.learn_type = learn_type diff --git a/apps/common/handle/impl/mineru/image_optimizer.py b/apps/common/handle/impl/mineru/image_optimizer.py index 3298b54f..29c02429 100644 --- a/apps/common/handle/impl/mineru/image_optimizer.py +++ b/apps/common/handle/impl/mineru/image_optimizer.py @@ -224,10 +224,10 @@ class ImageOptimizer: upload_options, max_retries: int = 3, retry_delay: float = 1.0) -> Tuple[Optional[str], Optional[str]]: - if os.getenv('MINERU_TEST_FILE'): - return image_info.filepath, None - # return image_info.filepath, None """上传单个图片(带并发控制和重试机制)""" + # 注释掉测试模式,让上传回调能够被调用 + # if os.getenv('MINERU_TEST_FILE'): + # return image_info.filepath, None async with self.upload_semaphore: # 处理图片 image_data, hash_value = await self.process_image_for_upload(image_info) @@ -344,8 +344,9 @@ class ImageOptimizer: async with self.api_semaphore: try: # 调用分类函数 + # classify_func expects: (learn_type, image_filepath, temp_dir, src_name, hint) return await classify_func( - vision_model, + vision_model, # This is actually learn_type img_data['image_info'].filepath, temp_dir, src_name, diff --git a/apps/common/handle/impl/mineru/image_processor.py b/apps/common/handle/impl/mineru/image_processor.py index 80d241e2..5264cc96 100644 --- a/apps/common/handle/impl/mineru/image_processor.py +++ b/apps/common/handle/impl/mineru/image_processor.py @@ -517,11 +517,16 @@ class MinerUImageProcessor: This is an enhanced version that uses context when available. """ + self.logger.info(f"mineru-image: _classify_single_image_with_context called for {os.path.basename(image_filepath)}") + # If no context, fall back to original method if not context: + self.logger.info(f"mineru-image: no context, falling back to original method") return await self._classify_single_image(learn_type, image_filepath, temp_dir, src_name, hint) try: + self.logger.info(f"mineru-image: processing with context for {os.path.basename(image_filepath)}") + if not os.path.exists(image_filepath): raise FileNotFoundError(f"Image file not found: {image_filepath}") @@ -535,6 +540,9 @@ class MinerUImageProcessor: # Build context-aware prompt with language prompt = self._build_context_aware_prompt(context, language_code) + # Log the final prompt for debugging + self.logger.info(f"mineru-image: Final prompt for {os.path.basename(image_filepath)}:\n{prompt[:1000]}...") + messages = [ {'role': 'system', 'content': prompt}, {'role': 'user', 'content': [ @@ -549,19 +557,34 @@ class MinerUImageProcessor: start_time = time.time() try: + self.logger.info(f"mineru-image: calling vision model for {os.path.basename(image_filepath)}") response = await self.config.call_litellm( model_type=learn_type, messages=messages, temperature=0.0, timeout=120.0 # Increased timeout to 120 seconds for vision models ) + self.logger.info(f"mineru-image: received response from vision model") duration = time.time() - start_time + # Log raw response for debugging + raw_response = response.choices[0].message.content if response.choices else "" + self.logger.info(f"mineru-image: raw AI response (first 500 chars): {raw_response[:500] if raw_response else 'Empty response'}") + # Log complete response for debugging + self.logger.info(f"mineru-image: FULL AI response for {os.path.basename(image_filepath)}:\n{raw_response}") + + # Log usage info + if hasattr(response, 'usage'): + self.logger.info(f"mineru-image: usage - prompt_tokens={getattr(response.usage, 'prompt_tokens', 0)}, " + f"completion_tokens={getattr(response.usage, 'completion_tokens', 0)}") + else: + self.logger.warning(f"mineru-image: no usage info in response") + # Parse enhanced response result = self._parse_context_aware_response( - response.choices[0].message.content, - response.usage, + raw_response, + response.usage if hasattr(response, 'usage') else None, duration ) @@ -569,15 +592,21 @@ class MinerUImageProcessor: result['has_context'] = True result['page_idx'] = context.page_idx + # Log successful classification + self.logger.info(f"mineru-image: classified {os.path.basename(image_filepath)} as {result.get('type', 'unknown')} " + f"(tokens: in={result.get('input_tokens', 0)}, out={result.get('output_tokens', 0)})") + except Exception as e: self.logger.error(f"mineru-image: classification error: {str(e)}") + self.logger.info(f"mineru-image: classification failed for {os.path.basename(image_filepath)}, returning meaningless") result = { 'type': 'meaningless', 'content': f'Classification error: {str(e)}', 'input_tokens': 0, 'output_tokens': 0, 'dura': time.time() - start_time, - 'has_context': True + 'has_context': True, + 'error': str(e) } return result @@ -652,13 +681,16 @@ class MinerUImageProcessor: # Parse JSON result_json = json.loads(response_content) + # Log the raw classification response for debugging + self.logger.info(f"mineru-image: parsed JSON response: {result_json}") + # Build result dictionary result = { 'type': result_json.get('type', 'meaningless'), 'title': result_json.get('title', ''), 'content': result_json.get('description', ''), - 'input_tokens': usage.prompt_tokens if hasattr(usage, 'prompt_tokens') else 0, - 'output_tokens': usage.completion_tokens if hasattr(usage, 'completion_tokens') else 0, + 'input_tokens': usage.prompt_tokens if usage and hasattr(usage, 'prompt_tokens') else 0, + 'output_tokens': usage.completion_tokens if usage and hasattr(usage, 'completion_tokens') else 0, 'dura': duration } @@ -670,13 +702,14 @@ class MinerUImageProcessor: except Exception as e: self.logger.error(f"mineru-image: failed to parse context response: {str(e)}") + self.logger.debug(f"mineru-image: response that failed to parse: {response_content[:500] if response_content else 'Empty'}") # Return a basic result return { 'type': 'brief_description', 'title': '', 'content': response_content[:200] if response_content else '', - 'input_tokens': usage.prompt_tokens if hasattr(usage, 'prompt_tokens') else 0, - 'output_tokens': usage.completion_tokens if hasattr(usage, 'completion_tokens') else 0, + 'input_tokens': usage.prompt_tokens if usage and hasattr(usage, 'prompt_tokens') else 0, + 'output_tokens': usage.completion_tokens if usage and hasattr(usage, 'completion_tokens') else 0, 'dura': duration } @@ -736,6 +769,9 @@ class MinerUImageProcessor: # Parse response response_content = response.choices[0].message.content + # Log complete response for debugging + self.logger.info(f"mineru-image: FULL AI response for {os.path.basename(image_filepath)} (no context):\n{response_content}") + # Extract JSON from markdown code block if present if '```json' in response_content and '```' in response_content: try: @@ -797,7 +833,8 @@ class MinerUImageProcessor: 'dura': time.time() - start_time, } - self.logger.info(f"mineru-image: classified {image_filepath} as {result.get('type', 'unknown')}") + # Enhanced logging to debug meaningless classification + self.logger.info(f"mineru-image: classified {image_filepath} as {result.get('type', 'unknown')} - tokens: in={result.get('input_tokens', 0)}, out={result.get('output_tokens', 0)}, error={result.get('error', 'None')}") return result diff --git a/apps/common/handle/impl/mineru/logger.py b/apps/common/handle/impl/mineru/logger.py index 9e8ffc35..fd24312c 100644 --- a/apps/common/handle/impl/mineru/logger.py +++ b/apps/common/handle/impl/mineru/logger.py @@ -83,7 +83,10 @@ def get_module_logger(module_name): pass # 默认情况,创建独立的logger - return logging.getLogger(module_name) + module_logger = logging.getLogger(module_name) + # 确保使用正确的日志级别 + module_logger.setLevel(logging.INFO) + return module_logger # 为了兼容性,导出默认logger logger = get_module_logger('mineru') diff --git a/apps/common/handle/impl/mineru/maxkb_adapter/adapter.py b/apps/common/handle/impl/mineru/maxkb_adapter/adapter.py index c7f5f9d3..9c86608c 100644 --- a/apps/common/handle/impl/mineru/maxkb_adapter/adapter.py +++ b/apps/common/handle/impl/mineru/maxkb_adapter/adapter.py @@ -29,6 +29,11 @@ class MaxKBAdapter(PlatformAdapter): """初始化MaxKB适配器""" self.file_storage = FileStorageClient() self.model_client = maxkb_model_client + + # 导入配置以获取存储路径 + from .config_maxkb import MaxKBMinerUConfig + self.config = MaxKBMinerUConfig() + self.storage_path = self.config.file_storage_path @contextlib.asynccontextmanager async def trace_context(self, trace_id: str): @@ -51,19 +56,68 @@ class MaxKBAdapter(PlatformAdapter): logger.debug(f"MaxKB: Released lock for {temp_dir}") async def upload_file(self, file_path: str, options: Any = None) -> str: - """使用MaxKB的文件存储上传文件""" + """使用MaxKB的文件存储上传文件 - 直接复制文件到存储目录""" + import shutil + import uuid + + logger.info(f"MaxKB: upload_file called with path={file_path}, options={options}") + # 如果在测试模式下,直接返回原图地址 - if os.getenv('MINERU_TEST_FILE'): - logger.info(f"MaxKB: Test mode - returning original path: {file_path}") - return file_path + #if os.getenv('MINERU_TEST_FILE'): + # logger.info(f"MaxKB: Test mode - returning original path: {file_path}") + # return file_path try: - # 使用MaxKB的文件存储客户端上传 - uploaded_url = await self.file_storage.upload_image(file_path) - return uploaded_url + # 确保文件存在 + if not os.path.exists(file_path): + logger.warning(f"MaxKB: File not found: {file_path}") + return file_path + + # 获取knowledge_id(如果在options中提供) + knowledge_id = None + if options and isinstance(options, (tuple, list)) and len(options) > 0: + knowledge_id = options[0] + + # 创建存储目录结构 + # 使用 knowledge_id 或 'mineru' 作为子目录 + sub_dir = knowledge_id if knowledge_id else 'mineru' + storage_dir = os.path.join(self.storage_path, sub_dir, 'images') + + # 确保存储目录存在 + os.makedirs(storage_dir, exist_ok=True) + + # 生成唯一的文件名,保留原始扩展名 + file_ext = os.path.splitext(file_path)[1] + file_name = f"{uuid.uuid4().hex}{file_ext}" + dest_path = os.path.join(storage_dir, file_name) + + # 复制文件到存储目录 + shutil.copy2(file_path, dest_path) + + # 返回相对路径或URL格式 + # 生成相对于storage根目录的路径 + relative_path = os.path.relpath(dest_path, self.storage_path) + # 确保路径使用正斜杠(兼容所有系统) + relative_path = relative_path.replace(os.path.sep, '/') + + # 根据环境配置生成完整的URL + # 检查是否配置了基础URL + base_url = os.getenv('MAXKB_BASE_URL', '') + if base_url: + # 如果有基础URL,生成完整的URL + result_url = f"{base_url.rstrip('/')}/storage/{relative_path}" + else: + # 生成相对URL,直接使用/storage/路径 + result_url = f"/storage/{relative_path}" + + logger.info(f"MaxKB: Copied file {file_path} -> {dest_path}") + logger.debug(f"MaxKB: Returning URL: {result_url}") + + return result_url + except Exception as e: - logger.error(f"MaxKB: Failed to upload file {file_path}: {str(e)}") - # 如果上传失败,返回本地路径 + logger.error(f"MaxKB: Failed to copy file {file_path}: {str(e)}") + # 如果复制失败,返回本地路径 return file_path def get_logger(self): @@ -332,4 +386,4 @@ class MinerUAdapter: except Exception as e: logger.error(f"MinerU处理文档失败: {str(e)}") # 返回空结果而不是抛出异常,让调用方可以回退到其他处理器 - return {'sections': []} \ No newline at end of file + return {'sections': []} diff --git a/apps/common/handle/impl/mineru/maxkb_adapter/config_maxkb.py b/apps/common/handle/impl/mineru/maxkb_adapter/config_maxkb.py index c328f791..c62bf975 100644 --- a/apps/common/handle/impl/mineru/maxkb_adapter/config_maxkb.py +++ b/apps/common/handle/impl/mineru/maxkb_adapter/config_maxkb.py @@ -28,7 +28,7 @@ class MaxKBMinerUConfig(MinerUConfig): # File storage settings self.file_storage_type = os.getenv('MAXKB_STORAGE_TYPE', 'local') # local, s3, oss - self.file_storage_path = os.getenv('MAXKB_STORAGE_PATH', '/tmp/maxkb/storage') + self.file_storage_path = os.getenv('MAXKB_STORAGE_PATH', '/opt/maxkb/storage') self.file_storage_bucket = os.getenv('MAXKB_STORAGE_BUCKET') # Model client settings @@ -133,37 +133,48 @@ class MaxKBMinerUConfig(MinerUConfig): # Call appropriate method based on content type if has_images: - # Extract image and text for vision model + # Extract image and combine all text content for vision model image_path = None - prompt = "" + combined_prompt = "" + + # First, collect system message if exists for msg in messages: - if isinstance(msg.get('content'), list): - for content_item in msg['content']: - if content_item.get('type') == 'text': - prompt = content_item.get('text', '') - elif content_item.get('type') == 'image_url': - image_url = content_item.get('image_url', {}) - if isinstance(image_url, dict): - url = image_url.get('url', '') - if url.startswith('data:'): - # Handle base64 image - import base64 - import tempfile - # Extract base64 data - base64_data = url.split(',')[1] if ',' in url else url - image_data = base64.b64decode(base64_data) - # Save to temp file - with tempfile.NamedTemporaryFile(suffix='.jpg', delete=False) as tmp: - tmp.write(image_data) - image_path = tmp.name - else: - image_path = url + if msg.get('role') == 'system': + combined_prompt = msg.get('content', '') + "\n\n" + break + + # Then extract user message content + for msg in messages: + if msg.get('role') == 'user': + if isinstance(msg.get('content'), list): + for content_item in msg['content']: + if content_item.get('type') == 'text': + combined_prompt += content_item.get('text', '') + elif content_item.get('type') == 'image_url': + image_url = content_item.get('image_url', {}) + if isinstance(image_url, dict): + url = image_url.get('url', '') + if url.startswith('data:'): + # Handle base64 image + import base64 + import tempfile + # Extract base64 data + base64_data = url.split(',')[1] if ',' in url else url + image_data = base64.b64decode(base64_data) + # Save to temp file + with tempfile.NamedTemporaryFile(suffix='.jpg', delete=False) as tmp: + tmp.write(image_data) + image_path = tmp.name + else: + image_path = url + elif isinstance(msg.get('content'), str): + combined_prompt += msg.get('content', '') if image_path: response_text = await maxkb_model_client.vision_completion( model_id=model_id, image_path=image_path, - prompt=prompt, + prompt=combined_prompt, **kwargs ) else: @@ -215,8 +226,14 @@ class MaxKBMinerUConfig(MinerUConfig): 'total_tokens': 0 })() - # Return empty response on error to continue processing - return MockResponse("") + # Return a valid JSON response on error to prevent parsing issues + # This will be parsed as a brief_description type + error_response = json.dumps({ + "type": "brief_description", + "title": "Error", + "description": f"Model call failed: {str(e)}" + }) + return MockResponse(error_response) def _get_default_llm_model_id(self) -> str: """获取默认的LLM模型ID""" diff --git a/apps/common/handle/impl/mineru/maxkb_adapter/maxkb_model_client.py b/apps/common/handle/impl/mineru/maxkb_adapter/maxkb_model_client.py index d51744d8..71733215 100644 --- a/apps/common/handle/impl/mineru/maxkb_adapter/maxkb_model_client.py +++ b/apps/common/handle/impl/mineru/maxkb_adapter/maxkb_model_client.py @@ -187,8 +187,13 @@ class MaxKBModelClient: llm_model = await self.get_llm_model(model_id) if not llm_model: - self.logger.warning(f"No model available for {model_id}, returning empty response") - return "" + self.logger.warning(f"No model available for {model_id}, returning error JSON") + import json + return json.dumps({ + "type": "brief_description", + "title": "No Model", + "description": "LLM model not available" + }) # 调用模型 - 使用 sync_to_async 包装同步调用 response = await sync_to_async(llm_model.invoke)(messages) @@ -203,8 +208,13 @@ class MaxKBModelClient: except Exception as e: self.logger.error(f"Chat completion failed: {str(e)}") - # 返回空字符串而不是抛出异常,让处理继续 - return "" + # 返回错误JSON而不是空字符串 + import json + return json.dumps({ + "type": "brief_description", + "title": "Error", + "description": f"Chat completion failed: {str(e)}" + }) async def vision_completion(self, model_id: str, image_path: str, prompt: str, **kwargs) -> str: """ @@ -224,16 +234,48 @@ class MaxKBModelClient: vision_model = await self.get_vision_model(model_id) if not vision_model: - self.logger.warning(f"No vision model available for {model_id}, returning empty response") - return "" + self.logger.warning(f"No vision model available for {model_id}, returning error JSON") + # Return a valid JSON response instead of empty string + import json + return json.dumps({ + "type": "brief_description", + "title": "No Model", + "description": "Vision model not available" + }) - # 构造消息 + # 读取图片并转换为base64 + import base64 + import os + + if not os.path.exists(image_path): + self.logger.error(f"Image file not found: {image_path}") + import json + return json.dumps({ + "type": "brief_description", + "title": "File Error", + "description": f"Image file not found: {image_path}" + }) + + try: + with open(image_path, 'rb') as img_file: + image_data = img_file.read() + image_base64 = base64.b64encode(image_data).decode('utf-8') + except Exception as e: + self.logger.error(f"Failed to read/encode image {image_path}: {str(e)}") + import json + return json.dumps({ + "type": "brief_description", + "title": "Image Error", + "description": f"Failed to read/encode image: {str(e)}" + }) + + # 构造消息 - 使用base64编码的图片 messages = [ { "role": "user", "content": [ {"type": "text", "text": prompt}, - {"type": "image_url", "image_url": {"url": f"file://{image_path}"}} + {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"}} ] } ] @@ -251,8 +293,13 @@ class MaxKBModelClient: except Exception as e: self.logger.error(f"Vision completion failed: {str(e)}") - # 返回空字符串而不是抛出异常,让处理继续 - return "" + # 返回错误JSON而不是空字符串 + import json + return json.dumps({ + "type": "brief_description", + "title": "Vision Error", + "description": f"Vision completion failed: {str(e)}" + }) async def batch_chat_completion(self, model_id: str, batch_messages: List[List[Dict]], **kwargs) -> List[str]: """ diff --git a/apps/common/handle/impl/mineru/parallel_processor.py b/apps/common/handle/impl/mineru/parallel_processor.py index 9856cacd..74b7d1e5 100644 --- a/apps/common/handle/impl/mineru/parallel_processor.py +++ b/apps/common/handle/impl/mineru/parallel_processor.py @@ -610,16 +610,26 @@ class ParallelMinerUProcessor: xref = image_info.xref if xref in classification_results: result = classification_results[xref] - - # Apply meaningless filter if configured - if self.config.filter_meaningless_images and result.get('type') == 'meaningless': - self.logger.info(f"Recognizer: filtering out meaningless image {image_info.filename}") - # Still store the classification for reference - task.image_descriptions[image_info.filename] = result - else: - # Either filter is disabled or image is meaningful - meaningful_images.append(image_info) - task.image_descriptions[image_info.filename] = result + else: + # No classification result - likely an error occurred + self.logger.warning(f"Recognizer: no classification result for {image_info.filename}, creating default result") + result = { + 'type': 'meaningless', + 'content': 'Classification failed - no result returned', + 'input_tokens': 0, + 'output_tokens': 0, + 'error': 'No classification result' + } + + # Apply meaningless filter if configured + if self.config.filter_meaningless_images and result.get('type') == 'meaningless': + self.logger.info(f"Recognizer: filtering out meaningless image {image_info.filename}") + # Still store the classification for reference + task.image_descriptions[image_info.filename] = result + else: + # Either filter is disabled or image is meaningful + meaningful_images.append(image_info) + task.image_descriptions[image_info.filename] = result # Send to upload queue if there are images to upload # Note: if filter is disabled, we upload all classified images including meaningless ones @@ -977,25 +987,24 @@ class ParallelMinerUProcessor: f"images_count={len(task.images)}") if has_content and (has_no_images or has_processed_images): - # Integrate image descriptions into content before marking complete - if task.processed_images and task.image_descriptions: - self.logger.info(f"Page {task.page_idx + 1} ready for image integration:") - self.logger.info(f" - processed_images: {list(task.processed_images.keys())}") + # Integrate images into content if we have any image descriptions + # This ensures meaningless images are properly removed from content + if task.image_descriptions: + self.logger.info(f"Page {task.page_idx + 1} processing image integration:") + self.logger.info(f" - processed_images: {list(task.processed_images.keys()) if task.processed_images else 'None (filtered out)'}") self.logger.info(f" - image_descriptions: {list(task.image_descriptions.keys())}") self.logger.info(f" - content length before: {len(task.refined_content)} chars") task.refined_content = self._integrate_images_into_content( task.refined_content, task.image_descriptions, - task.processed_images, + task.processed_images or {}, # Pass empty dict if None f"{task.src_fileid}_page_{task.page_idx}" ) self.logger.info(f" - content length after: {len(task.refined_content)} chars") else: - self.logger.info(f"Page {task.page_idx + 1} has no images to integrate: " - f"processed_images={bool(task.processed_images)}, " - f"image_descriptions={bool(task.image_descriptions)}") + self.logger.info(f"Page {task.page_idx + 1} has no images to process") task.status = TaskStatus.COMPLETED should_mark_complete = True @@ -1032,96 +1041,109 @@ class ParallelMinerUProcessor: # Process each image description for filename, desc_info in image_descriptions.items(): self.logger.info(f"\nChecking image {filename} for replacement") - if filename in uploaded_images: - uploaded_url = uploaded_images[filename] - self.logger.info(f" - Found in uploaded_images: {uploaded_url}") - - # Extract the hash part from the filename - # This handles filenames like: 390561cb34fd3f951b1d25a252bead1c_page_1_44450601...jpg - # or mineru_image_xxx.png - base_filename = filename.replace('.png', '').replace('.jpg', '').replace('.jpeg', '') - - # Try to extract the hash part (usually the long hex string) - # For mineru images: mineru_image_XXX -> XXX - # For hash-based: XXX_page_N_YYY -> YYY (the last hash) - if 'mineru_image_' in filename: - ref = base_filename.replace('mineru_image_', '') - else: - # Look for the last hash-like pattern - parts = base_filename.split('_') - # Find the longest hex-like string - hash_parts = [p for p in parts if len(p) > 20 and all(c in '0123456789abcdef' for c in p)] - if hash_parts: - ref = hash_parts[-1] # Use the last hash - else: - ref = base_filename - - # Build replacement content based on image type - img_type = desc_info.get('type', 'brief_description') - title = desc_info.get('title', '') - description = desc_info.get('content', '') - ocr_content = desc_info.get('ocr_content', '') - - # Create the replacement markdown - if img_type == 'meaningless': - # Skip meaningless images - continue - elif img_type == 'structured_content': - # For structured content, include full description - replacement = f"\n\n![{title}]({uploaded_url})\n\n\n{ocr_content}\n\n" - else: - # For other types, use a simpler format - if description: - replacement = f"\n\n![{title}]({uploaded_url})\n\n" - else: - replacement = f"\n\n![{title}]({uploaded_url})\n\n" - - # Replace various possible image reference patterns - # We need to be flexible because the reference in content might be different from our filename - patterns = [] - - # If we found a hash reference, try to match it in various formats - if ref and len(ref) > 20: # Likely a hash - patterns.extend([ - f"!\\[.*?\\]\\(.*?{ref}.*?\\)", # Match hash anywhere in path - f"!\\[\\]\\(.*?{ref}.*?\\)", # Empty alt text with hash - f"!\\[.*?\\]\\(images/{ref}\\.[^)]+\\)", # images/hash.ext - f"!\\[\\]\\(images/{ref}\\.[^)]+\\)", # images/hash.ext with empty alt - ]) - - # Always try the full filename patterns - patterns.extend([ - f"!\\[.*?\\]\\(.*?{re.escape(filename)}\\)", # Match exact filename - f"!\\[.*?\\]\\(.*?{re.escape(base_filename)}\\)", # Match base filename - ]) - - # Add generic mineru image pattern if applicable - if 'mineru_image_' in filename: - patterns.append(f"!\\[.*?\\]\\(.*?{filename}\\)") - - self.logger.info(f" - extracted ref: '{ref}'") - self.logger.info(f" - trying {len(patterns)} patterns") - - replaced = False - for pattern in patterns: - new_content = re.sub(pattern, replacement, enhanced_content) - if new_content != enhanced_content: - enhanced_content = new_content - replaced = True - self.logger.info(f"Successfully replaced image {filename} using pattern: {pattern}") - break - - if not replaced: - self.logger.warning(f"Failed to replace image reference for: {filename}, ref={ref}") - # Log the first few characters of content to help debugging - sample = enhanced_content[:500] if len(enhanced_content) > 500 else enhanced_content - self.logger.info(f"Content sample: {sample}...") - # Also log the exact patterns we tried - self.logger.info(f"Tried patterns:") - for p in patterns: - self.logger.info(f" - {p}") + + # Get image type to determine if it's meaningless + img_type = desc_info.get('type', 'brief_description') + + # Process ALL images that have descriptions + # - Meaningless images: remove references (replace with empty string) + # - Images not uploaded but classified: also remove (likely filtered) + # - Uploaded images: replace with proper markdown + uploaded_url = uploaded_images.get(filename, '') + + if img_type == 'meaningless': + self.logger.info(f" - Image is meaningless, will remove references") + elif filename not in uploaded_images: + self.logger.info(f" - Image was classified as {img_type} but not uploaded (filtered), will remove references") + # Treat as meaningless for removal purposes + img_type = 'meaningless' else: - self.logger.warning(f"Image {filename} not found in uploaded_images!") + self.logger.info(f" - Found in uploaded_images: {uploaded_url}") + + # Extract the hash part from the filename + # This handles filenames like: 390561cb34fd3f951b1d25a252bead1c_page_1_44450601...jpg + # or mineru_image_xxx.png + base_filename = filename.replace('.png', '').replace('.jpg', '').replace('.jpeg', '') + + # Try to extract the hash part (usually the long hex string) + # For mineru images: mineru_image_XXX -> XXX + # For hash-based: XXX_page_N_YYY -> YYY (the last hash) + if 'mineru_image_' in filename: + ref = base_filename.replace('mineru_image_', '') + else: + # Look for the last hash-like pattern + parts = base_filename.split('_') + # Find the longest hex-like string + hash_parts = [p for p in parts if len(p) > 20 and all(c in '0123456789abcdef' for c in p)] + if hash_parts: + ref = hash_parts[-1] # Use the last hash + else: + ref = base_filename + + # Build replacement content based on image type + # img_type already extracted above + title = desc_info.get('title', '') + description = desc_info.get('content', '') + ocr_content = desc_info.get('ocr_content', '') + + # Create the replacement markdown + if img_type == 'meaningless': + # For meaningless images, we want to remove them entirely + replacement = "" # Empty string to remove the image reference + elif img_type == 'structured_content': + # For structured content, include full description + replacement = f"\n\n![{title}]({uploaded_url})\n\n\n{ocr_content}\n\n" + else: + # For other types, use a simpler format + if description: + replacement = f"\n\n![{title}]({uploaded_url})\n\n" + else: + replacement = f"\n\n![{title}]({uploaded_url})\n\n" + + # Replace various possible image reference patterns + # We need to be flexible because the reference in content might be different from our filename + patterns = [] + + # If we found a hash reference, try to match it in various formats + if ref and len(ref) > 20: # Likely a hash + patterns.extend([ + f"!\\[.*?\\]\\(.*?{ref}.*?\\)", # Match hash anywhere in path + f"!\\[\\]\\(.*?{ref}.*?\\)", # Empty alt text with hash + f"!\\[.*?\\]\\(images/{ref}\\.[^)]+\\)", # images/hash.ext + f"!\\[\\]\\(images/{ref}\\.[^)]+\\)", # images/hash.ext with empty alt + ]) + + # Always try the full filename patterns + patterns.extend([ + f"!\\[.*?\\]\\(.*?{re.escape(filename)}\\)", # Match exact filename + f"!\\[.*?\\]\\(.*?{re.escape(base_filename)}\\)", # Match base filename + ]) + + # Add generic mineru image pattern if applicable + if 'mineru_image_' in filename: + patterns.append(f"!\\[.*?\\]\\(.*?{filename}\\)") + + self.logger.info(f" - extracted ref: '{ref}'") + self.logger.info(f" - trying {len(patterns)} patterns") + + replaced = False + for pattern in patterns: + new_content = re.sub(pattern, replacement, enhanced_content) + if new_content != enhanced_content: + enhanced_content = new_content + replaced = True + self.logger.info(f"Successfully replaced image {filename} using pattern: {pattern}") + break + + if not replaced: + self.logger.warning(f"Failed to replace image reference for: {filename}, ref={ref}") + # Log the first few characters of content to help debugging + sample = enhanced_content[:500] if len(enhanced_content) > 500 else enhanced_content + self.logger.info(f"Content sample: {sample}...") + # Also log the exact patterns we tried + self.logger.info(f"Tried patterns:") + for p in patterns: + self.logger.info(f" - {p}") return enhanced_content diff --git a/apps/common/handle/impl/mineru/prompts/image_classification.py b/apps/common/handle/impl/mineru/prompts/image_classification.py index 17d36cbb..47908f50 100644 --- a/apps/common/handle/impl/mineru/prompts/image_classification.py +++ b/apps/common/handle/impl/mineru/prompts/image_classification.py @@ -56,18 +56,6 @@ IMAGE_CLASSIFICATION_CONTEXT_BASE = IMAGE_CLASSIFICATION_BASE + """ - description字段请控制在100-200字以内,需要: - 解释图片与周围文本的关系 - 说明图片在文档中的作用 -- context_relevance 字段,表示图片与上下文的相关性(high/medium/low) - -# 输出格式: -```json -{ - "type": "分类类型", - "title": "简短标题", - "description": "详细描述", - "ocr_content": "提取的文字内容(如适用)", - "context_relevance": "相关性等级" -} -``` # 上下文信息: """ diff --git a/apps/maxkb/urls.py b/apps/maxkb/urls.py index 166ea574..b4309b48 100644 --- a/apps/maxkb/urls.py +++ b/apps/maxkb/urls.py @@ -72,6 +72,65 @@ def pro(): if not settings.DEBUG: pro() +# 添加storage路由 - 使用函数视图避免类导入问题 +def serve_storage_file(request, file_path): + """ + 直接提供storage目录下的文件访问 + """ + import os + import mimetypes + from django.http import HttpResponse, Http404 + from django.utils.encoding import escape_uri_path + + # 基础存储路径 - 支持本地开发和Docker环境 + base_path = os.getenv('MAXKB_STORAGE_PATH', '/opt/maxkb/storage') + # 如果是本地开发环境,使用相对路径 + if not os.path.exists(base_path): + base_path = './tmp/maxkb/storage' + full_path = os.path.join(base_path, file_path) + + # 安全检查 + try: + real_base = os.path.realpath(base_path) + real_path = os.path.realpath(full_path) + if not real_path.startswith(real_base): + raise Http404("File not found") + except (OSError, ValueError): + raise Http404("File not found") + + # 检查文件是否存在 + if not os.path.exists(full_path) or not os.path.isfile(full_path): + raise Http404("File not found") + + # 读取文件 + try: + with open(full_path, 'rb') as f: + file_content = f.read() + except IOError: + raise Http404("File not found") + + # 获取MIME类型 + content_type, _ = mimetypes.guess_type(full_path) + if not content_type: + content_type = 'application/octet-stream' + + # 构建响应 + response = HttpResponse(file_content, content_type=content_type) + + # 设置响应头 + file_name = os.path.basename(full_path) + if content_type.startswith('image/'): + response['Content-Disposition'] = f'inline; filename="{escape_uri_path(file_name)}"' + response['Cache-Control'] = 'public, max-age=2592000' # 30天缓存 + else: + response['Content-Disposition'] = f'attachment; filename="{escape_uri_path(file_name)}"' + response['Cache-Control'] = 'public, max-age=86400' # 1天缓存 + + return response + +# 添加storage路由 +urlpatterns.insert(0, re_path(r'^storage/(?P.*)$', serve_storage_file, name='storage_file')) + def get_index_html(index_path): file = open(index_path, "r", encoding='utf-8') diff --git a/apps/oss/urls.py b/apps/oss/urls.py index f344049f..d07eb3e6 100644 --- a/apps/oss/urls.py +++ b/apps/oss/urls.py @@ -6,4 +6,5 @@ app_name = 'oss' urlpatterns = [ path('oss/file', views.FileView.as_view()), + # storage路由已移至主URL配置中 ] diff --git a/apps/oss/views/__init__.py b/apps/oss/views/__init__.py index 1082199e..0b4667a5 100644 --- a/apps/oss/views/__init__.py +++ b/apps/oss/views/__init__.py @@ -1 +1,2 @@ -from .file import * \ No newline at end of file +from .file import * +from .storage import StorageFileView \ No newline at end of file diff --git a/apps/oss/views/storage.py b/apps/oss/views/storage.py new file mode 100644 index 00000000..903fd874 --- /dev/null +++ b/apps/oss/views/storage.py @@ -0,0 +1,87 @@ +# coding=utf-8 +""" +Storage file service for MinerU parsed images +""" +import os +import mimetypes +from pathlib import Path + +from django.http import HttpResponse, Http404 +from django.utils.encoding import escape_uri_path +from django.views import View + + +class StorageFileView(View): + """ + 静态文件服务视图,用于提供MinerU解析后的图片访问 + 使用Django基础View类,完全不涉及认证系统 + """ + + def get(self, request, file_path: str): + """ + 获取存储的文件 + + Args: + request: HTTP请求 + file_path: 文件相对路径(如:mineru/images/xxx.jpg) + + Returns: + 文件内容或404错误 + """ + # 基础存储路径(从环境变量读取,默认为/opt/maxkb/storage) + base_path = os.getenv('MAXKB_STORAGE_PATH', '/opt/maxkb/storage') + + # 构建完整文件路径 + full_path = os.path.join(base_path, file_path) + + # 安全检查:确保请求的路径在base_path内 + try: + # 规范化路径,解析符号链接等 + real_base = os.path.realpath(base_path) + real_path = os.path.realpath(full_path) + + # 确保文件路径在基础路径内(防止路径遍历攻击) + if not real_path.startswith(real_base): + raise Http404("File not found") + except (OSError, ValueError): + raise Http404("File not found") + + # 检查文件是否存在 + if not os.path.exists(full_path) or not os.path.isfile(full_path): + raise Http404("File not found") + + # 读取文件内容 + try: + with open(full_path, 'rb') as f: + file_content = f.read() + except IOError: + raise Http404("File not found") + + # 获取文件MIME类型 + content_type, _ = mimetypes.guess_type(full_path) + if not content_type: + content_type = 'application/octet-stream' + + # 构建响应 + response = HttpResponse(file_content, content_type=content_type) + + # 设置文件名(用于下载) + file_name = os.path.basename(full_path) + # 对于图片类型,使用inline显示;其他类型使用attachment下载 + if content_type.startswith('image/'): + disposition = 'inline' + else: + disposition = 'attachment' + + # 使用escape_uri_path处理文件名中的特殊字符 + response['Content-Disposition'] = f'{disposition}; filename="{escape_uri_path(file_name)}"' + + # 设置缓存控制(图片可以缓存较长时间) + if content_type.startswith('image/'): + # 图片缓存30天 + response['Cache-Control'] = 'public, max-age=2592000' + else: + # 其他文件缓存1天 + response['Cache-Control'] = 'public, max-age=86400' + + return response \ No newline at end of file diff --git a/dev/docker-compose-simple.yml b/dev/docker-compose-simple.yml index ee6a7beb..685e57fa 100644 --- a/dev/docker-compose-simple.yml +++ b/dev/docker-compose-simple.yml @@ -15,12 +15,50 @@ services: - ../tmp:/tmp # 数据持久化 - ~/.maxkb:/opt/maxkb + # MinerU图片存储目录持久化 + - ~/.maxkb/storage:/opt/maxkb/storage:rw environment: # 开启调试模式 DJANGO_DEBUG: "True" PYTHONUNBUFFERED: "1" MAXKB_LOG_LEVEL: "DEBUG" + + # MinerU 配置 + MINERU_PARSER_CACHE: "True" + MINERU_MULTIMODAL_REFINEMENT: "True" + # MinerU 图片存储路径 + MAXKB_STORAGE_PATH: "/opt/maxkb/storage" + MINERU_API_TYPE: "self_hosted" MINERU_API_URL: "http://mineru:8000" + + # MINERU_API_TYPE: "cloud" + # MINERU_API_URL: "https://mineru.net" + # MINERU_API_KEY: "eyJ0eXBlIjoiSldUIiwiYWxnIjoiSFM1MTIifQ.eyJqdGkiOiI2OTYwMDEwNiIsInJvbCI6IlJPTEVfUkVHSVNURVIiLCJpc3MiOiJPcGVuWExhYiIsImlhdCI6MTc1NTE2MzQ5OCwiY2xpZW50SWQiOiJsa3pkeDU3bnZ5MjJqa3BxOXgydyIsInBob25lIjoiIiwib3BlbklkIjpudWxsLCJ1dWlkIjoiYTQwODk5NjMtNDI1OS00MWM3LWE3NWItY2IzZTQ4NTRjYWIwIiwiZW1haWwiOiIiLCJleHAiOjE3NTYzNzMwOTh9.88m9JSKQhkwJ557jCTFOgmdjeAfpXzxy2QDINaJ0rfHfnMNBxQt47aHr2jABeuxW-fXm8S5AO7zWWTXEGx8BxA" + + # 配置队列大小 + MINERU_QUEUE_SIZE: "50" + # 配置处理超时时间(秒) + MINERU_PROCESSING_TIMEOUT: "7200" + # 配置各线程数量(支持多线程) + MINERU_PARSER_THREADS: "1" + MINERU_REFINER_THREADS: "3" + MINERU_RECOGNIZER_THREADS: "3" + MINERU_UPLOADER_THREADS: "1" + + MINERU_BATCH_PROCESSING: "true" + MINERU_BATCH_SIZE: "10" + # 启用/禁用过滤 + MINERU_SKIP_SMALL_IMAGES: "true" + # 每页最多图片数 + MINERU_MAX_IMAGES_PER_PAGE: "10" + # 文档最多图片数 + MINERU_MAX_IMAGES_PER_DOCUMENT: "200" + # 最小图片尺寸(像素) + MINERU_MIN_IMAGE_SIZE: "10000" + # 最大图片尺寸(像素) + MINERU_MAX_IMAGE_SIZE: "10000000" + # 过滤无意义图片 + MINERU_FILTER_MEANINGLESS: "true" restart: unless-stopped # 使用镜像默认的启动命令 diff --git a/installer/Dockerfile b/installer/Dockerfile index fc4c2223..eb866118 100644 --- a/installer/Dockerfile +++ b/installer/Dockerfile @@ -75,6 +75,11 @@ ENV MAXKB_VERSION="${DOCKER_IMAGE_TAG} (build at ${BUILD_AT}, commit: ${GITHUB_C PATH=/opt/py3/bin:$PATH \ PIP_TARGET=/opt/maxkb/python-packages +# Install poppler-utils for PDF processing (required by MinerU) +RUN apt-get update && \ + apt-get install -y --no-install-recommends poppler-utils && \ + apt-get clean all && \ + rm -rf /var/lib/apt/lists/* WORKDIR /opt/maxkb-app COPY --from=stage-build /opt/maxkb-app /opt/maxkb-app diff --git a/test_image_access.py b/test_image_access.py new file mode 100644 index 00000000..227739a1 --- /dev/null +++ b/test_image_access.py @@ -0,0 +1,59 @@ +#!/usr/bin/env python +""" +测试图片存储和访问 + +这个脚本会: +1. 创建一个测试图片在存储目录 +2. 打印正确的访问URL +""" + +import os +import sys + +def main(): + # 设置存储路径(本地开发环境) + storage_path = os.getenv('MAXKB_STORAGE_PATH', './tmp/maxkb/storage') + + print("=" * 60) + print("MaxKB 图片存储和访问测试") + print("=" * 60) + + # 创建目录结构 + image_dir = os.path.join(storage_path, 'mineru', 'images') + os.makedirs(image_dir, exist_ok=True) + print(f"\n1. 存储目录:{image_dir}") + + # 创建测试图片文件 + test_image = os.path.join(image_dir, 'ac3681aaa7a346b49ef9c7ceb7b94058.jpg') + with open(test_image, 'wb') as f: + # 写入一个简单的测试内容(实际应该是图片二进制数据) + f.write(b'TEST IMAGE CONTENT') + print(f"2. 创建测试文件:{test_image}") + + # 生成访问URL + print("\n3. 访问URL:") + print(f" 本地开发:http://localhost:8080/storage/mineru/images/ac3681aaa7a346b49ef9c7ceb7b94058.jpg") + print(f" Docker环境:http://localhost:8080/storage/mineru/images/ac3681aaa7a346b49ef9c7ceb7b94058.jpg") + + # 列出当前存储目录的所有文件 + print(f"\n4. 存储目录内容:") + for root, dirs, files in os.walk(storage_path): + level = root.replace(storage_path, '').count(os.sep) + indent = ' ' * level + print(f'{indent}{os.path.basename(root)}/') + subindent = ' ' * (level + 1) + for file in files: + file_path = os.path.join(root, file) + file_size = os.path.getsize(file_path) + print(f'{subindent}{file} ({file_size} bytes)') + + print("\n" + "=" * 60) + print("测试完成!") + print("\n注意事项:") + print("1. 确保Django服务器正在运行") + print("2. URL路径现在是 /storage/ 开头,简洁直接") + print("3. 如果使用Docker,确保volume正确挂载") + print("=" * 60) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/test_storage.py b/test_storage.py new file mode 100644 index 00000000..84feeefe --- /dev/null +++ b/test_storage.py @@ -0,0 +1,131 @@ +#!/usr/bin/env python +""" +测试MinerU图片存储和访问功能 + +使用方法: +1. 在本地开发环境:python test_storage.py +2. 在Docker环境:docker exec -it maxkb-dev python /opt/maxkb-app/test_storage.py +""" + +import os +import sys +import tempfile +import shutil +from pathlib import Path + +def test_storage(): + """测试存储功能""" + print("=" * 60) + print("MinerU 图片存储测试") + print("=" * 60) + + # 1. 检查存储路径配置 + storage_path = os.getenv('MAXKB_STORAGE_PATH', '/opt/maxkb/storage') + print(f"\n1. 存储路径配置:{storage_path}") + + # 2. 创建测试目录结构 + test_dir = os.path.join(storage_path, 'test', 'images') + print(f"\n2. 创建测试目录:{test_dir}") + os.makedirs(test_dir, exist_ok=True) + + # 3. 创建测试图片文件 + test_image_path = os.path.join(test_dir, 'test_image.txt') + print(f"\n3. 创建测试文件:{test_image_path}") + with open(test_image_path, 'w') as f: + f.write("This is a test image file for MinerU storage") + + # 4. 验证文件创建 + if os.path.exists(test_image_path): + print(" ✓ 文件创建成功") + file_size = os.path.getsize(test_image_path) + print(f" 文件大小:{file_size} bytes") + else: + print(" ✗ 文件创建失败") + return False + + # 5. 生成访问URL + relative_path = os.path.relpath(test_image_path, storage_path) + access_url = f"/api/storage/{relative_path}" + print(f"\n4. 生成的访问URL:{access_url}") + + # 6. 列出存储目录内容 + print(f"\n5. 存储目录内容:") + for root, dirs, files in os.walk(storage_path): + level = root.replace(storage_path, '').count(os.sep) + indent = ' ' * 2 * level + print(f'{indent}{os.path.basename(root)}/') + subindent = ' ' * 2 * (level + 1) + for file in files: + print(f'{subindent}{file}') + + print("\n" + "=" * 60) + print("测试完成!") + print("\n配置建议:") + print("1. 确保Docker volume正确挂载:~/.maxkb/storage:/opt/maxkb/storage") + print("2. 确保环境变量设置:MAXKB_STORAGE_PATH=/opt/maxkb/storage") + print("3. 访问图片URL格式:http://localhost:8080/api/storage/mineru/images/xxx.jpg") + print("=" * 60) + + return True + +def test_mineru_adapter(): + """测试MinerU适配器""" + print("\n" + "=" * 60) + print("测试MinerU适配器") + print("=" * 60) + + # 添加apps目录到Python路径 + sys.path.insert(0, '/opt/maxkb-app/apps' if os.path.exists('/opt/maxkb-app/apps') else './apps') + + try: + from common.handle.impl.mineru.maxkb_adapter.adapter import MaxKBAdapter + + print("\n1. 创建MaxKB适配器实例") + adapter = MaxKBAdapter() + print(f" 存储路径:{adapter.storage_path}") + + # 创建临时测试文件 + with tempfile.NamedTemporaryFile(suffix='.jpg', delete=False) as tmp: + tmp.write(b"Test image content") + tmp_path = tmp.name + + print(f"\n2. 测试upload_file方法") + print(f" 源文件:{tmp_path}") + + # 使用异步方式调用 + import asyncio + async def test_upload(): + result = await adapter.upload_file(tmp_path, options=['test_knowledge']) + return result + + # 运行异步测试 + try: + loop = asyncio.get_event_loop() + except RuntimeError: + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + + result_url = loop.run_until_complete(test_upload()) + print(f" 返回URL:{result_url}") + + # 清理临时文件 + os.unlink(tmp_path) + + print("\n✓ MinerU适配器测试成功") + + except ImportError as e: + print(f"\n✗ 无法导入MinerU适配器:{e}") + print(" 请确保在MaxKB环境中运行此测试") + except Exception as e: + print(f"\n✗ 测试失败:{e}") + import traceback + traceback.print_exc() + +if __name__ == "__main__": + # 运行存储测试 + if test_storage(): + # 如果基础存储测试成功,尝试测试适配器 + try: + test_mineru_adapter() + except: + print("\n提示:适配器测试需要在MaxKB环境中运行") \ No newline at end of file diff --git a/test_storage_simple.py b/test_storage_simple.py new file mode 100644 index 00000000..ecf2958f --- /dev/null +++ b/test_storage_simple.py @@ -0,0 +1,22 @@ +#!/usr/bin/env python +""" +简单的存储测试 - 创建测试图片 +""" +import os + +# 创建存储目录 +storage_path = './tmp/maxkb/storage/mineru/images' +os.makedirs(storage_path, exist_ok=True) + +# 创建测试图片(实际是一个文本文件,但后缀是.jpg) +test_file = os.path.join(storage_path, 'ac3681aaa7a346b49ef9c7ceb7b94058.jpg') +with open(test_file, 'wb') as f: + # 写入一个最小的JPEG文件头(这样浏览器会识别为图片) + # FF D8 FF E0 是JPEG文件的魔术数字 + f.write(bytes.fromhex('FFD8FFE000104A46494600010101006000600000FFDB004300080606070605080707070909080A0C140D0C0B0B0C1912130F141D1A1F1E1D1A1C1C20242E2720222C231C1C2837292C30313434341F27393D38323C2E333432FFDB0043010909090C0B0C180D0D1832211C2132323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232FFC00011080001000103012200021101031101FFC4001F0000010501010101010100000000000000000102030405060708090A0BFFC400B5100002010303020403050504040000017D01020300041105122131410613516107227114328191A1082342B1C11552D1F02433627282090A161718191A25262728292A3435363738393A434445464748494A535455565758595A636465666768696A737475767778797A838485868788898A92939495969798999AA2A3A4A5A6A7A8A9AAB2B3B4B5B6B7B8B9BAC2C3C4C5C6C7C8C9CAD2D3D4D5D6D7D8D9DAE1E2E3E4E5E6E7E8E9EAF1F2F3F4F5F6F7F8F9FAFFC4001F0100030101010101010101010000000000000102030405060708090A0BFFC400B51100020102040403040705040400010277000102031104052131061241510761711322328108144291A1B1C109233352F0156272D10A162434E125F11718191A262728292A35363738393A434445464748494A535455565758595A636465666768696A737475767778797A82838485868788898A92939495969798999AA2A3A4A5A6A7A8A9AAB2B3B4B5B6B7B8B9BAC2C3C4C5C6C7C8C9CAD2D3D4D5D6D7D8D9DAE2E3E4E5E6E7E8E9EAF2F3F4F5F6F7F8F9FAFFDA000C03010002110311003F00F9FFD9')) + +print(f"测试文件已创建:{test_file}") +print(f"文件大小:{os.path.getsize(test_file)} bytes") +print("\n访问URL:") +print("http://localhost:8080/storage/mineru/images/ac3681aaa7a346b49ef9c7ceb7b94058.jpg") +print("\n如果Django服务正在运行,可以直接在浏览器中访问上述URL") \ No newline at end of file