add mineru

This commit is contained in:
朱潮 2025-08-24 17:45:40 +08:00
parent 35f9a4dbfe
commit f0263bf189
20 changed files with 845 additions and 184 deletions

61
MINERU_STORAGE_README.md Normal file
View File

@ -0,0 +1,61 @@
# MinerU 图片存储配置说明
## 访问URL格式
MinerU解析后的图片访问URL格式为:
```
http://localhost:8080/storage/mineru/images/xxx.jpg
```
简洁直接,直接使用 `/storage/` 路径访问
## 存储路径配置
### 本地开发环境
- 存储路径:`./tmp/maxkb/storage/`
- 图片位置:`./tmp/maxkb/storage/mineru/images/`
### Docker环境
- 容器内路径:`/opt/maxkb/storage/`
- 本地映射路径:`~/.maxkb/storage/`
- 图片位置:`~/.maxkb/storage/mineru/images/`
## 环境变量配置
在 `.env` 文件或 `docker-compose.yml` 中添加:
```bash
MAXKB_STORAGE_PATH=/opt/maxkb/storage
```
## Docker Volume配置
`docker-compose.yml` 中已配置:
```yaml
volumes:
- ~/.maxkb/storage:/opt/maxkb/storage:rw
```
## 测试访问
1. 运行测试脚本创建测试图片:
```bash
python test_image_access.py
```
2. 访问测试URL:
```
http://localhost:8080/storage/mineru/images/ac3681aaa7a346b49ef9c7ceb7b94058.jpg
```
## 故障排查
1. **404错误**:检查文件是否存在于存储目录
2. **权限错误**:确保存储目录有写入权限
3. **路径错误**:确认URL路径以 `/storage/` 开头
## 相关文件
- 存储视图:`apps/oss/views/storage.py`
- URL配置:`apps/oss/urls.py`(注意:`/storage/` 路由已移至主URL配置中注册)
- MinerU适配器:`apps/common/handle/impl/mineru/maxkb_adapter/adapter.py`
- 配置文件:`apps/common/handle/impl/mineru/maxkb_adapter/config_maxkb.py`

View File

@ -268,12 +268,35 @@ class BaseMinerUExtractor:
self.logger.info(f"mineru-parser: found {len(cached_pages)} cached pages")
# 获取上传回调(通过适配器)
upload_callback = lambda file_path: self.adapter.upload_file(file_path, upload_options)
# 总是创建upload_callback让适配器决定如何处理
# upload_func 需要接受4个参数: filepath, filename, upload_options, binary_data
async def upload_callback(filepath, filename, options, binary_data=None):
    """Upload one image through the platform adapter.

    Args:
        filepath: Path of the image on disk; used when ``binary_data`` is
            falsy.
        filename: Image file name; supplies the temp-file suffix and appears
            in the warning log.
        options: Accepted so the callback matches the 4-argument upload
            function signature. The ``upload_options`` captured from the
            enclosing scope is what is actually forwarded to the adapter.
        binary_data: Optional image bytes (supplied when the image was
            compressed). They are written to a temporary file first because
            the adapter uploads from a path.

    Returns:
        A ``(url, upload_key)`` tuple. ``upload_key`` is always ``None``;
        both items are ``None`` when neither a path nor bytes were given.
    """
    if binary_data:
        import tempfile

        suffix = os.path.splitext(filename)[1]
        with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp:
            tmp.write(binary_data)
            tmp_path = tmp.name
        try:
            result = await self.adapter.upload_file(tmp_path, upload_options)
        finally:
            # Always remove the temp file, including when the upload raises;
            # previously a failed upload leaked one file per image.
            try:
                os.unlink(tmp_path)
            except OSError as cleanup_error:
                self.logger.debug(
                    f"Failed to remove temp file {tmp_path}: {cleanup_error}"
                )
    elif filepath:
        result = await self.adapter.upload_file(filepath, upload_options)
    else:
        self.logger.warning(f"No file path or binary data provided for {filename}")
        return None, None

    # Callers expect the (url, upload_key) pair.
    return result, None
# 并行处理文档
completed_tasks = await self.parallel_processor.process_document_with_cache(
pdf_path, temp_dir, src_fileid, is_ppt_format,
len(pages_info), upload_callback if upload_options else None, upload_options,
len(pages_info), upload_callback, upload_options,
cached_pages=cached_pages,
save_callback=lambda idx, data: self._save_page_cache(temp_dir, idx, data)
)

View File

@ -45,6 +45,11 @@ class GPTBaseAdapter(PlatformAdapter):
async def upload_file(self, file_path: str, options: Any = None) -> str:
"""上传文件 - PDF使用S3特殊处理图片使用gzero上传"""
# 如果在测试模式下,直接返回原图地址
if os.getenv('MINERU_TEST_FILE'):
logger.info(f"Gbase: Test mode - returning original path: {file_path}")
return file_path
import os
# 判断文件类型

View File

@ -224,10 +224,10 @@ class ImageOptimizer:
upload_options,
max_retries: int = 3,
retry_delay: float = 1.0) -> Tuple[Optional[str], Optional[str]]:
if os.getenv('MINERU_TEST_FILE'):
return image_info.filepath, None
# return image_info.filepath, None
"""上传单个图片(带并发控制和重试机制)"""
# 注释掉测试模式,让上传回调能够被调用
# if os.getenv('MINERU_TEST_FILE'):
# return image_info.filepath, None
async with self.upload_semaphore:
# 处理图片
image_data, hash_value = await self.process_image_for_upload(image_info)
@ -344,8 +344,9 @@ class ImageOptimizer:
async with self.api_semaphore:
try:
# 调用分类函数
# classify_func expects: (learn_type, image_filepath, temp_dir, src_name, hint)
return await classify_func(
vision_model,
vision_model, # This is actually learn_type
img_data['image_info'].filepath,
temp_dir,
src_name,

View File

@ -517,11 +517,16 @@ class MinerUImageProcessor:
This is an enhanced version that uses context when available.
"""
self.logger.info(f"mineru-image: _classify_single_image_with_context called for {os.path.basename(image_filepath)}")
# If no context, fall back to original method
if not context:
self.logger.info(f"mineru-image: no context, falling back to original method")
return await self._classify_single_image(learn_type, image_filepath, temp_dir, src_name, hint)
try:
self.logger.info(f"mineru-image: processing with context for {os.path.basename(image_filepath)}")
if not os.path.exists(image_filepath):
raise FileNotFoundError(f"Image file not found: {image_filepath}")
@ -535,6 +540,9 @@ class MinerUImageProcessor:
# Build context-aware prompt with language
prompt = self._build_context_aware_prompt(context, language_code)
# Log the final prompt for debugging
self.logger.info(f"mineru-image: Final prompt for {os.path.basename(image_filepath)}:\n{prompt[:1000]}...")
messages = [
{'role': 'system', 'content': prompt},
{'role': 'user', 'content': [
@ -549,19 +557,34 @@ class MinerUImageProcessor:
start_time = time.time()
try:
self.logger.info(f"mineru-image: calling vision model for {os.path.basename(image_filepath)}")
response = await self.config.call_litellm(
model_type=learn_type,
messages=messages,
temperature=0.0,
timeout=120.0 # Increased timeout to 120 seconds for vision models
)
self.logger.info(f"mineru-image: received response from vision model")
duration = time.time() - start_time
# Log raw response for debugging
raw_response = response.choices[0].message.content if response.choices else ""
self.logger.info(f"mineru-image: raw AI response (first 500 chars): {raw_response[:500] if raw_response else 'Empty response'}")
# Log complete response for debugging
self.logger.info(f"mineru-image: FULL AI response for {os.path.basename(image_filepath)}:\n{raw_response}")
# Log usage info
if hasattr(response, 'usage'):
self.logger.info(f"mineru-image: usage - prompt_tokens={getattr(response.usage, 'prompt_tokens', 0)}, "
f"completion_tokens={getattr(response.usage, 'completion_tokens', 0)}")
else:
self.logger.warning(f"mineru-image: no usage info in response")
# Parse enhanced response
result = self._parse_context_aware_response(
response.choices[0].message.content,
response.usage,
raw_response,
response.usage if hasattr(response, 'usage') else None,
duration
)
@ -569,15 +592,21 @@ class MinerUImageProcessor:
result['has_context'] = True
result['page_idx'] = context.page_idx
# Log successful classification
self.logger.info(f"mineru-image: classified {os.path.basename(image_filepath)} as {result.get('type', 'unknown')} "
f"(tokens: in={result.get('input_tokens', 0)}, out={result.get('output_tokens', 0)})")
except Exception as e:
self.logger.error(f"mineru-image: classification error: {str(e)}")
self.logger.info(f"mineru-image: classification failed for {os.path.basename(image_filepath)}, returning meaningless")
result = {
'type': 'meaningless',
'content': f'Classification error: {str(e)}',
'input_tokens': 0,
'output_tokens': 0,
'dura': time.time() - start_time,
'has_context': True
'has_context': True,
'error': str(e)
}
return result
@ -652,13 +681,16 @@ class MinerUImageProcessor:
# Parse JSON
result_json = json.loads(response_content)
# Log the raw classification response for debugging
self.logger.info(f"mineru-image: parsed JSON response: {result_json}")
# Build result dictionary
result = {
'type': result_json.get('type', 'meaningless'),
'title': result_json.get('title', ''),
'content': result_json.get('description', ''),
'input_tokens': usage.prompt_tokens if hasattr(usage, 'prompt_tokens') else 0,
'output_tokens': usage.completion_tokens if hasattr(usage, 'completion_tokens') else 0,
'input_tokens': usage.prompt_tokens if usage and hasattr(usage, 'prompt_tokens') else 0,
'output_tokens': usage.completion_tokens if usage and hasattr(usage, 'completion_tokens') else 0,
'dura': duration
}
@ -670,13 +702,14 @@ class MinerUImageProcessor:
except Exception as e:
self.logger.error(f"mineru-image: failed to parse context response: {str(e)}")
self.logger.debug(f"mineru-image: response that failed to parse: {response_content[:500] if response_content else 'Empty'}")
# Return a basic result
return {
'type': 'brief_description',
'title': '',
'content': response_content[:200] if response_content else '',
'input_tokens': usage.prompt_tokens if hasattr(usage, 'prompt_tokens') else 0,
'output_tokens': usage.completion_tokens if hasattr(usage, 'completion_tokens') else 0,
'input_tokens': usage.prompt_tokens if usage and hasattr(usage, 'prompt_tokens') else 0,
'output_tokens': usage.completion_tokens if usage and hasattr(usage, 'completion_tokens') else 0,
'dura': duration
}
@ -736,6 +769,9 @@ class MinerUImageProcessor:
# Parse response
response_content = response.choices[0].message.content
# Log complete response for debugging
self.logger.info(f"mineru-image: FULL AI response for {os.path.basename(image_filepath)} (no context):\n{response_content}")
# Extract JSON from markdown code block if present
if '```json' in response_content and '```' in response_content:
try:
@ -797,7 +833,8 @@ class MinerUImageProcessor:
'dura': time.time() - start_time,
}
self.logger.info(f"mineru-image: classified {image_filepath} as {result.get('type', 'unknown')}")
# Enhanced logging to debug meaningless classification
self.logger.info(f"mineru-image: classified {image_filepath} as {result.get('type', 'unknown')} - tokens: in={result.get('input_tokens', 0)}, out={result.get('output_tokens', 0)}, error={result.get('error', 'None')}")
return result

View File

@ -83,7 +83,10 @@ def get_module_logger(module_name):
pass
# 默认情况创建独立的logger
return logging.getLogger(module_name)
module_logger = logging.getLogger(module_name)
# 确保使用正确的日志级别
module_logger.setLevel(logging.INFO)
return module_logger
# 为了兼容性导出默认logger
logger = get_module_logger('mineru')

View File

@ -30,6 +30,11 @@ class MaxKBAdapter(PlatformAdapter):
self.file_storage = FileStorageClient()
self.model_client = maxkb_model_client
# 导入配置以获取存储路径
from .config_maxkb import MaxKBMinerUConfig
self.config = MaxKBMinerUConfig()
self.storage_path = self.config.file_storage_path
@contextlib.asynccontextmanager
async def trace_context(self, trace_id: str):
"""MaxKB的trace上下文 - 如果没有特殊实现,使用简单的上下文"""
@ -51,19 +56,68 @@ class MaxKBAdapter(PlatformAdapter):
logger.debug(f"MaxKB: Released lock for {temp_dir}")
async def upload_file(self, file_path: str, options: Any = None) -> str:
    """Store a file in MaxKB's local storage by copying it into the storage directory.

    The copy lands in ``<storage_path>/<sub_dir>/images/<uuid><ext>``, where
    ``sub_dir`` is the first element of ``options`` (treated as the knowledge
    id) or ``'mineru'`` when none is given.

    Args:
        file_path: Path of the file to copy.
        options: Optional tuple/list whose first item is used as the
            sub-directory name.

    Returns:
        ``/storage/<relative path>`` (prefixed with ``MAXKB_BASE_URL`` when
        that environment variable is set). If the source file is missing or
        the copy fails, the original ``file_path`` is returned unchanged so
        processing can continue.
    """
    import shutil
    import uuid

    logger.info(f"MaxKB: upload_file called with path={file_path}, options={options}")

    try:
        if not os.path.exists(file_path):
            logger.warning(f"MaxKB: File not found: {file_path}")
            return file_path

        # The knowledge id, when provided, is the first element of options.
        knowledge_id = None
        if options and isinstance(options, (tuple, list)) and len(options) > 0:
            knowledge_id = options[0]

        # The sub-directory must be a single plain path component. A value
        # containing separators or '..' would let the copy escape storage_path.
        sub_dir = str(knowledge_id) if knowledge_id else 'mineru'
        if sub_dir in ('.', '..') or any(sep in sub_dir for sep in '/\\'):
            logger.warning(f"MaxKB: Unsafe storage sub-directory {sub_dir!r}, using 'mineru'")
            sub_dir = 'mineru'

        storage_dir = os.path.join(self.storage_path, sub_dir, 'images')
        os.makedirs(storage_dir, exist_ok=True)

        # Unique name, original extension preserved.
        file_ext = os.path.splitext(file_path)[1]
        file_name = f"{uuid.uuid4().hex}{file_ext}"
        dest_path = os.path.join(storage_dir, file_name)

        shutil.copy2(file_path, dest_path)

        # URL path is relative to the storage root and always uses '/'.
        relative_path = os.path.relpath(dest_path, self.storage_path)
        relative_path = relative_path.replace(os.path.sep, '/')

        base_url = os.getenv('MAXKB_BASE_URL', '')
        if base_url:
            result_url = f"{base_url.rstrip('/')}/storage/{relative_path}"
        else:
            result_url = f"/storage/{relative_path}"

        logger.info(f"MaxKB: Copied file {file_path} -> {dest_path}")
        logger.debug(f"MaxKB: Returning URL: {result_url}")
        return result_url

    except Exception as e:
        # Deliberate best effort: fall back to the local path on any failure.
        logger.error(f"MaxKB: Failed to copy file {file_path}: {str(e)}")
        return file_path
def get_logger(self):

View File

@ -28,7 +28,7 @@ class MaxKBMinerUConfig(MinerUConfig):
# File storage settings
self.file_storage_type = os.getenv('MAXKB_STORAGE_TYPE', 'local') # local, s3, oss
self.file_storage_path = os.getenv('MAXKB_STORAGE_PATH', '/tmp/maxkb/storage')
self.file_storage_path = os.getenv('MAXKB_STORAGE_PATH', '/opt/maxkb/storage')
self.file_storage_bucket = os.getenv('MAXKB_STORAGE_BUCKET')
# Model client settings
@ -133,14 +133,23 @@ class MaxKBMinerUConfig(MinerUConfig):
# Call appropriate method based on content type
if has_images:
# Extract image and text for vision model
# Extract image and combine all text content for vision model
image_path = None
prompt = ""
combined_prompt = ""
# First, collect system message if exists
for msg in messages:
if msg.get('role') == 'system':
combined_prompt = msg.get('content', '') + "\n\n"
break
# Then extract user message content
for msg in messages:
if msg.get('role') == 'user':
if isinstance(msg.get('content'), list):
for content_item in msg['content']:
if content_item.get('type') == 'text':
prompt = content_item.get('text', '')
combined_prompt += content_item.get('text', '')
elif content_item.get('type') == 'image_url':
image_url = content_item.get('image_url', {})
if isinstance(image_url, dict):
@ -158,12 +167,14 @@ class MaxKBMinerUConfig(MinerUConfig):
image_path = tmp.name
else:
image_path = url
elif isinstance(msg.get('content'), str):
combined_prompt += msg.get('content', '')
if image_path:
response_text = await maxkb_model_client.vision_completion(
model_id=model_id,
image_path=image_path,
prompt=prompt,
prompt=combined_prompt,
**kwargs
)
else:
@ -215,8 +226,14 @@ class MaxKBMinerUConfig(MinerUConfig):
'total_tokens': 0
})()
# Return empty response on error to continue processing
return MockResponse("")
# Return a valid JSON response on error to prevent parsing issues
# This will be parsed as a brief_description type
error_response = json.dumps({
"type": "brief_description",
"title": "Error",
"description": f"Model call failed: {str(e)}"
})
return MockResponse(error_response)
def _get_default_llm_model_id(self) -> str:
"""获取默认的LLM模型ID"""

View File

@ -187,8 +187,13 @@ class MaxKBModelClient:
llm_model = await self.get_llm_model(model_id)
if not llm_model:
self.logger.warning(f"No model available for {model_id}, returning empty response")
return ""
self.logger.warning(f"No model available for {model_id}, returning error JSON")
import json
return json.dumps({
"type": "brief_description",
"title": "No Model",
"description": "LLM model not available"
})
# 调用模型 - 使用 sync_to_async 包装同步调用
response = await sync_to_async(llm_model.invoke)(messages)
@ -203,8 +208,13 @@ class MaxKBModelClient:
except Exception as e:
self.logger.error(f"Chat completion failed: {str(e)}")
# 返回空字符串而不是抛出异常,让处理继续
return ""
# 返回错误JSON而不是空字符串
import json
return json.dumps({
"type": "brief_description",
"title": "Error",
"description": f"Chat completion failed: {str(e)}"
})
async def vision_completion(self, model_id: str, image_path: str, prompt: str, **kwargs) -> str:
"""
@ -224,16 +234,48 @@ class MaxKBModelClient:
vision_model = await self.get_vision_model(model_id)
if not vision_model:
self.logger.warning(f"No vision model available for {model_id}, returning empty response")
return ""
self.logger.warning(f"No vision model available for {model_id}, returning error JSON")
# Return a valid JSON response instead of empty string
import json
return json.dumps({
"type": "brief_description",
"title": "No Model",
"description": "Vision model not available"
})
# 构造消息
# 读取图片并转换为base64
import base64
import os
if not os.path.exists(image_path):
self.logger.error(f"Image file not found: {image_path}")
import json
return json.dumps({
"type": "brief_description",
"title": "File Error",
"description": f"Image file not found: {image_path}"
})
try:
with open(image_path, 'rb') as img_file:
image_data = img_file.read()
image_base64 = base64.b64encode(image_data).decode('utf-8')
except Exception as e:
self.logger.error(f"Failed to read/encode image {image_path}: {str(e)}")
import json
return json.dumps({
"type": "brief_description",
"title": "Image Error",
"description": f"Failed to read/encode image: {str(e)}"
})
# 构造消息 - 使用base64编码的图片
messages = [
{
"role": "user",
"content": [
{"type": "text", "text": prompt},
{"type": "image_url", "image_url": {"url": f"file://{image_path}"}}
{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"}}
]
}
]
@ -251,8 +293,13 @@ class MaxKBModelClient:
except Exception as e:
self.logger.error(f"Vision completion failed: {str(e)}")
# 返回空字符串而不是抛出异常,让处理继续
return ""
# 返回错误JSON而不是空字符串
import json
return json.dumps({
"type": "brief_description",
"title": "Vision Error",
"description": f"Vision completion failed: {str(e)}"
})
async def batch_chat_completion(self, model_id: str, batch_messages: List[List[Dict]], **kwargs) -> List[str]:
"""

View File

@ -610,6 +610,16 @@ class ParallelMinerUProcessor:
xref = image_info.xref
if xref in classification_results:
result = classification_results[xref]
else:
# No classification result - likely an error occurred
self.logger.warning(f"Recognizer: no classification result for {image_info.filename}, creating default result")
result = {
'type': 'meaningless',
'content': 'Classification failed - no result returned',
'input_tokens': 0,
'output_tokens': 0,
'error': 'No classification result'
}
# Apply meaningless filter if configured
if self.config.filter_meaningless_images and result.get('type') == 'meaningless':
@ -977,25 +987,24 @@ class ParallelMinerUProcessor:
f"images_count={len(task.images)}")
if has_content and (has_no_images or has_processed_images):
# Integrate image descriptions into content before marking complete
if task.processed_images and task.image_descriptions:
self.logger.info(f"Page {task.page_idx + 1} ready for image integration:")
self.logger.info(f" - processed_images: {list(task.processed_images.keys())}")
# Integrate images into content if we have any image descriptions
# This ensures meaningless images are properly removed from content
if task.image_descriptions:
self.logger.info(f"Page {task.page_idx + 1} processing image integration:")
self.logger.info(f" - processed_images: {list(task.processed_images.keys()) if task.processed_images else 'None (filtered out)'}")
self.logger.info(f" - image_descriptions: {list(task.image_descriptions.keys())}")
self.logger.info(f" - content length before: {len(task.refined_content)} chars")
task.refined_content = self._integrate_images_into_content(
task.refined_content,
task.image_descriptions,
task.processed_images,
task.processed_images or {}, # Pass empty dict if None
f"{task.src_fileid}_page_{task.page_idx}"
)
self.logger.info(f" - content length after: {len(task.refined_content)} chars")
else:
self.logger.info(f"Page {task.page_idx + 1} has no images to integrate: "
f"processed_images={bool(task.processed_images)}, "
f"image_descriptions={bool(task.image_descriptions)}")
self.logger.info(f"Page {task.page_idx + 1} has no images to process")
task.status = TaskStatus.COMPLETED
should_mark_complete = True
@ -1032,8 +1041,23 @@ class ParallelMinerUProcessor:
# Process each image description
for filename, desc_info in image_descriptions.items():
self.logger.info(f"\nChecking image {filename} for replacement")
if filename in uploaded_images:
uploaded_url = uploaded_images[filename]
# Get image type to determine if it's meaningless
img_type = desc_info.get('type', 'brief_description')
# Process ALL images that have descriptions
# - Meaningless images: remove references (replace with empty string)
# - Images not uploaded but classified: also remove (likely filtered)
# - Uploaded images: replace with proper markdown
uploaded_url = uploaded_images.get(filename, '')
if img_type == 'meaningless':
self.logger.info(f" - Image is meaningless, will remove references")
elif filename not in uploaded_images:
self.logger.info(f" - Image was classified as {img_type} but not uploaded (filtered), will remove references")
# Treat as meaningless for removal purposes
img_type = 'meaningless'
else:
self.logger.info(f" - Found in uploaded_images: {uploaded_url}")
# Extract the hash part from the filename
@ -1057,15 +1081,15 @@ class ParallelMinerUProcessor:
ref = base_filename
# Build replacement content based on image type
img_type = desc_info.get('type', 'brief_description')
# img_type already extracted above
title = desc_info.get('title', '')
description = desc_info.get('content', '')
ocr_content = desc_info.get('ocr_content', '')
# Create the replacement markdown
if img_type == 'meaningless':
# Skip meaningless images
continue
# For meaningless images, we want to remove them entirely
replacement = "" # Empty string to remove the image reference
elif img_type == 'structured_content':
# For structured content, include full description
replacement = f"\n\n![{title}]({uploaded_url})\n<!--{description}-->\n\n{ocr_content}\n\n"
@ -1120,8 +1144,6 @@ class ParallelMinerUProcessor:
self.logger.info(f"Tried patterns:")
for p in patterns:
self.logger.info(f" - {p}")
else:
self.logger.warning(f"Image {filename} not found in uploaded_images!")
return enhanced_content

View File

@ -56,18 +56,6 @@ IMAGE_CLASSIFICATION_CONTEXT_BASE = IMAGE_CLASSIFICATION_BASE + """
- description字段请控制在100-200字以内需要
- 解释图片与周围文本的关系
- 说明图片在文档中的作用
- context_relevance 字段表示图片与上下文的相关性high/medium/low
# 输出格式:
```json
{
"type": "分类类型",
"title": "简短标题",
"description": "详细描述",
"ocr_content": "提取的文字内容(如适用)",
"context_relevance": "相关性等级"
}
```
# 上下文信息:
"""

View File

@ -72,6 +72,65 @@ def pro():
if not settings.DEBUG:
pro()
# 添加storage路由 - 使用函数视图避免类导入问题
def serve_storage_file(request, file_path):
    """Serve a file from the storage directory.

    The storage root comes from ``MAXKB_STORAGE_PATH`` (default
    ``/opt/maxkb/storage``); when that directory does not exist the relative
    path ``./tmp/maxkb/storage`` is used instead.

    Args:
        request: The incoming HTTP request (not inspected).
        file_path: Path captured from the URL, relative to the storage root.

    Returns:
        An ``HttpResponse`` with the file bytes. Images are sent ``inline``
        with a 30-day cache header; everything else is sent as an
        ``attachment`` with a 1-day cache header.

    Raises:
        Http404: If the path resolves outside the storage root, is not a
            regular file, or cannot be read.
    """
    import os
    import mimetypes
    from django.http import HttpResponse, Http404
    from django.utils.encoding import escape_uri_path

    base_path = os.getenv('MAXKB_STORAGE_PATH', '/opt/maxkb/storage')
    if not os.path.exists(base_path):
        base_path = './tmp/maxkb/storage'

    full_path = os.path.join(base_path, file_path)

    # Containment check on fully resolved paths. A plain startswith() test
    # would accept sibling directories such as "/opt/maxkb/storage_evil", so
    # compare whole path components with commonpath() instead.
    try:
        real_base = os.path.realpath(base_path)
        real_path = os.path.realpath(full_path)
        if os.path.commonpath([real_base, real_path]) != real_base:
            raise Http404("File not found")
    except (OSError, ValueError):
        raise Http404("File not found")

    # Use the validated, resolved path from here on.
    if not os.path.isfile(real_path):
        raise Http404("File not found")

    try:
        with open(real_path, 'rb') as f:
            file_content = f.read()
    except OSError:
        raise Http404("File not found")

    content_type, _ = mimetypes.guess_type(full_path)
    if not content_type:
        content_type = 'application/octet-stream'

    response = HttpResponse(file_content, content_type=content_type)

    file_name = os.path.basename(full_path)
    if content_type.startswith('image/'):
        response['Content-Disposition'] = f'inline; filename="{escape_uri_path(file_name)}"'
        response['Cache-Control'] = 'public, max-age=2592000'  # 30 days
    else:
        response['Content-Disposition'] = f'attachment; filename="{escape_uri_path(file_name)}"'
        response['Cache-Control'] = 'public, max-age=86400'  # 1 day

    return response
# 添加storage路由
urlpatterns.insert(0, re_path(r'^storage/(?P<file_path>.*)$', serve_storage_file, name='storage_file'))
def get_index_html(index_path):
file = open(index_path, "r", encoding='utf-8')

View File

@ -6,4 +6,5 @@ app_name = 'oss'
urlpatterns = [
path('oss/file', views.FileView.as_view()),
# storage路由已移至主URL配置中
]

View File

@ -1 +1,2 @@
from .file import *
from .storage import StorageFileView

87
apps/oss/views/storage.py Normal file
View File

@ -0,0 +1,87 @@
# coding=utf-8
"""
Storage file service for MinerU parsed images
"""
import os
import mimetypes
from pathlib import Path
from django.http import HttpResponse, Http404
from django.utils.encoding import escape_uri_path
from django.views import View
class StorageFileView(View):
    """Static file view that serves images produced by MinerU parsing.

    Built on Django's plain ``View`` class, so no authentication is applied.
    """

    def get(self, request, file_path: str):
        """Return the stored file at ``file_path``.

        Args:
            request: The HTTP request (not inspected).
            file_path: Path relative to the storage root, e.g.
                ``mineru/images/xxx.jpg``.

        Returns:
            An ``HttpResponse`` with the file bytes. Images are sent
            ``inline`` with a 30-day cache header; other files are sent as an
            ``attachment`` with a 1-day cache header.

        Raises:
            Http404: If the path resolves outside the storage root, is not a
                regular file, or cannot be read.
        """
        # Storage root from the environment, defaulting to /opt/maxkb/storage.
        base_path = os.getenv('MAXKB_STORAGE_PATH', '/opt/maxkb/storage')
        full_path = os.path.join(base_path, file_path)

        # Resolve symlinks and '..', then require the target to sit inside the
        # root. commonpath() compares whole components; a startswith() test
        # would also accept sibling directories like "<root>_evil".
        try:
            real_base = os.path.realpath(base_path)
            real_path = os.path.realpath(full_path)
            if os.path.commonpath([real_base, real_path]) != real_base:
                raise Http404("File not found")
        except (OSError, ValueError):
            raise Http404("File not found")

        # Use the validated, resolved path from here on.
        if not os.path.isfile(real_path):
            raise Http404("File not found")

        try:
            with open(real_path, 'rb') as f:
                file_content = f.read()
        except OSError:
            raise Http404("File not found")

        content_type, _ = mimetypes.guess_type(full_path)
        if not content_type:
            content_type = 'application/octet-stream'

        response = HttpResponse(file_content, content_type=content_type)

        file_name = os.path.basename(full_path)

        # Images display in the browser and may be cached for 30 days;
        # anything else downloads and is cached for 1 day.
        if content_type.startswith('image/'):
            disposition = 'inline'
            cache_control = 'public, max-age=2592000'
        else:
            disposition = 'attachment'
            cache_control = 'public, max-age=86400'

        # escape_uri_path handles special characters in the file name.
        response['Content-Disposition'] = f'{disposition}; filename="{escape_uri_path(file_name)}"'
        response['Cache-Control'] = cache_control

        return response

View File

@ -15,12 +15,50 @@ services:
- ../tmp:/tmp
# 数据持久化
- ~/.maxkb:/opt/maxkb
# MinerU图片存储目录持久化
- ~/.maxkb/storage:/opt/maxkb/storage:rw
environment:
# 开启调试模式
DJANGO_DEBUG: "True"
PYTHONUNBUFFERED: "1"
MAXKB_LOG_LEVEL: "DEBUG"
# MinerU 配置
MINERU_PARSER_CACHE: "True"
MINERU_MULTIMODAL_REFINEMENT: "True"
# MinerU 图片存储路径
MAXKB_STORAGE_PATH: "/opt/maxkb/storage"
MINERU_API_TYPE: "self_hosted"
MINERU_API_URL: "http://mineru:8000"
# MINERU_API_TYPE: "cloud"
# MINERU_API_URL: "https://mineru.net"
# MINERU_API_KEY: "<your-mineru-api-key>"  # 请勿将真实密钥提交到仓库,应通过未纳入版本控制的 .env 文件注入
# 配置队列大小
MINERU_QUEUE_SIZE: "50"
# 配置处理超时时间(秒)
MINERU_PROCESSING_TIMEOUT: "7200"
# 配置各线程数量(支持多线程)
MINERU_PARSER_THREADS: "1"
MINERU_REFINER_THREADS: "3"
MINERU_RECOGNIZER_THREADS: "3"
MINERU_UPLOADER_THREADS: "1"
MINERU_BATCH_PROCESSING: "true"
MINERU_BATCH_SIZE: "10"
# 启用/禁用过滤
MINERU_SKIP_SMALL_IMAGES: "true"
# 每页最多图片数
MINERU_MAX_IMAGES_PER_PAGE: "10"
# 文档最多图片数
MINERU_MAX_IMAGES_PER_DOCUMENT: "200"
# 最小图片尺寸(像素)
MINERU_MIN_IMAGE_SIZE: "10000"
# 最大图片尺寸(像素)
MINERU_MAX_IMAGE_SIZE: "10000000"
# 过滤无意义图片
MINERU_FILTER_MEANINGLESS: "true"
restart: unless-stopped
# 使用镜像默认的启动命令

View File

@ -75,6 +75,11 @@ ENV MAXKB_VERSION="${DOCKER_IMAGE_TAG} (build at ${BUILD_AT}, commit: ${GITHUB_C
PATH=/opt/py3/bin:$PATH \
PIP_TARGET=/opt/maxkb/python-packages
# Install poppler-utils for PDF processing (required by MinerU)
RUN apt-get update && \
apt-get install -y --no-install-recommends poppler-utils && \
apt-get clean all && \
rm -rf /var/lib/apt/lists/*
WORKDIR /opt/maxkb-app
COPY --from=stage-build /opt/maxkb-app /opt/maxkb-app

59
test_image_access.py Normal file
View File

@ -0,0 +1,59 @@
#!/usr/bin/env python
"""
测试图片存储和访问
这个脚本会
1. 创建一个测试图片在存储目录
2. 打印正确的访问URL
"""
import os
import sys
def main():
    """Write a placeholder image into the storage tree and print its access URLs.

    The storage root is read from ``MAXKB_STORAGE_PATH`` (default
    ``./tmp/maxkb/storage``). The function then lists every file found under
    that root together with its size.
    """
    image_name = 'ac3681aaa7a346b49ef9c7ceb7b94058.jpg'
    rule = "=" * 60
    storage_root = os.getenv('MAXKB_STORAGE_PATH', './tmp/maxkb/storage')

    print(rule)
    print("MaxKB 图片存储和访问测试")
    print(rule)

    # Directory layout: <root>/mineru/images
    images_dir = os.path.join(storage_root, 'mineru', 'images')
    os.makedirs(images_dir, exist_ok=True)
    print(f"\n1. 存储目录:{images_dir}")

    # Placeholder payload only; these bytes are not a real image.
    target = os.path.join(images_dir, image_name)
    with open(target, 'wb') as handle:
        handle.write(b'TEST IMAGE CONTENT')
    print(f"2. 创建测试文件:{target}")

    url = f"http://localhost:8080/storage/mineru/images/{image_name}"
    print("\n3. 访问URL")
    print(f" 本地开发{url}")
    print(f" Docker环境{url}")

    # Indented listing of everything currently under the storage root.
    print("\n4. 存储目录内容:")
    for current, _subdirs, names in os.walk(storage_root):
        depth = current.replace(storage_root, '').count(os.sep)
        print(f"{' ' * depth}{os.path.basename(current)}/")
        entry_pad = ' ' * (depth + 1)
        for name in names:
            size = os.path.getsize(os.path.join(current, name))
            print(f'{entry_pad}{name} ({size} bytes)')

    closing_lines = (
        "\n" + rule,
        "测试完成!",
        "\n注意事项:",
        "1. 确保Django服务器正在运行",
        "2. URL路径现在是 /storage/ 开头,简洁直接",
        "3. 如果使用Docker确保volume正确挂载",
        rule,
    )
    for line in closing_lines:
        print(line)


if __name__ == "__main__":
    main()

131
test_storage.py Normal file
View File

@ -0,0 +1,131 @@
#!/usr/bin/env python
"""
测试MinerU图片存储和访问功能
使用方法
1. 在本地开发环境python test_storage.py
2. 在Docker环境docker exec -it maxkb-dev python /opt/maxkb-app/test_storage.py
"""
import os
import sys
import tempfile
import shutil
from pathlib import Path
def test_storage():
    """Check that the storage directory is writable and print the resulting URL.

    Reads the storage root from ``MAXKB_STORAGE_PATH`` (default
    ``/opt/maxkb/storage``), creates ``test/images/test_image.txt`` under it,
    prints the URL the file is reachable at, and lists the storage tree.

    Returns:
        ``True`` when the test file was created, ``False`` when the directory
        or file could not be created.
    """
    print("=" * 60)
    print("MinerU 图片存储测试")
    print("=" * 60)

    # 1. Storage root
    storage_path = os.getenv('MAXKB_STORAGE_PATH', '/opt/maxkb/storage')
    print(f"\n1. 存储路径配置:{storage_path}")

    # 2-3. Create the directory tree and the test file. Filesystem errors
    # (e.g. no permission on /opt outside Docker) are reported as a failed
    # check instead of an unhandled traceback.
    test_dir = os.path.join(storage_path, 'test', 'images')
    test_image_path = os.path.join(test_dir, 'test_image.txt')
    try:
        print(f"\n2. 创建测试目录:{test_dir}")
        os.makedirs(test_dir, exist_ok=True)

        print(f"\n3. 创建测试文件:{test_image_path}")
        with open(test_image_path, 'w', encoding='utf-8') as f:
            f.write("This is a test image file for MinerU storage")
    except OSError as e:
        print(f" ✗ 文件创建失败 ({e})")
        return False

    # Verify the file really exists.
    if os.path.exists(test_image_path):
        print(" ✓ 文件创建成功")
        file_size = os.path.getsize(test_image_path)
        print(f" 文件大小:{file_size} bytes")
    else:
        print(" ✗ 文件创建失败")
        return False

    # 4. Access URL. Files are served under /storage/ (not /api/storage/),
    # and URL paths always use forward slashes.
    relative_path = os.path.relpath(test_image_path, storage_path).replace(os.sep, '/')
    access_url = f"/storage/{relative_path}"
    print(f"\n4. 生成的访问URL{access_url}")

    # 5. Storage tree listing
    print("\n5. 存储目录内容:")
    for root, dirs, files in os.walk(storage_path):
        level = root.replace(storage_path, '').count(os.sep)
        indent = ' ' * 2 * level
        print(f'{indent}{os.path.basename(root)}/')
        subindent = ' ' * 2 * (level + 1)
        for file in files:
            print(f'{subindent}{file}')

    print("\n" + "=" * 60)
    print("测试完成!")
    print("\n配置建议:")
    print("1. 确保Docker volume正确挂载~/.maxkb/storage:/opt/maxkb/storage")
    print("2. 确保环境变量设置MAXKB_STORAGE_PATH=/opt/maxkb/storage")
    print("3. 访问图片URL格式http://localhost:8080/storage/mineru/images/xxx.jpg")
    print("=" * 60)
    return True
def test_mineru_adapter():
    """Exercise ``MaxKBAdapter.upload_file`` with a throwaway temp file.

    Adds the ``apps`` directory to ``sys.path`` (``/opt/maxkb-app/apps`` when
    it exists, otherwise ``./apps``), imports the adapter, uploads a small
    temporary ``.jpg`` file and prints the returned URL. Import errors and
    other failures are reported on stdout; nothing is raised or returned.
    """
    print("\n" + "=" * 60)
    print("测试MinerU适配器")
    print("=" * 60)

    # Make the apps package importable.
    apps_dir = '/opt/maxkb-app/apps' if os.path.exists('/opt/maxkb-app/apps') else './apps'
    if apps_dir not in sys.path:
        sys.path.insert(0, apps_dir)

    try:
        from common.handle.impl.mineru.maxkb_adapter.adapter import MaxKBAdapter

        print("\n1. 创建MaxKB适配器实例")
        adapter = MaxKBAdapter()
        print(f" 存储路径:{adapter.storage_path}")

        # Temporary source file for the upload.
        with tempfile.NamedTemporaryFile(suffix='.jpg', delete=False) as tmp:
            tmp.write(b"Test image content")
            tmp_path = tmp.name

        try:
            print("\n2. 测试upload_file方法")
            print(f" 源文件:{tmp_path}")

            # upload_file is a coroutine; asyncio.run creates and closes a
            # fresh event loop for this single call.
            import asyncio
            result_url = asyncio.run(
                adapter.upload_file(tmp_path, options=['test_knowledge'])
            )
            print(f" 返回URL{result_url}")
        finally:
            # Remove the temp file even when the upload raises.
            os.unlink(tmp_path)

        print("\n✓ MinerU适配器测试成功")

    except ImportError as e:
        print(f"\n✗ 无法导入MinerU适配器{e}")
        print(" 请确保在MaxKB环境中运行此测试")
    except Exception as e:
        print(f"\n✗ 测试失败:{e}")
        import traceback
        traceback.print_exc()
if __name__ == "__main__":
    # Run the basic storage check first; the adapter check only makes sense
    # once the storage directory is known to be writable.
    if test_storage():
        try:
            test_mineru_adapter()
        except Exception:
            # Catch Exception, not a bare except, so Ctrl-C and SystemExit
            # still terminate the script.
            print("\n提示适配器测试需要在MaxKB环境中运行")

22
test_storage_simple.py Normal file
View File

@ -0,0 +1,22 @@
#!/usr/bin/env python
"""
简单的存储测试 - 创建测试图片
"""
import os
# Create the storage directory used by the local development setup.
storage_path = './tmp/maxkb/storage/mineru/images'
os.makedirs(storage_path, exist_ok=True)

# Create the test image. It holds no picture data, only a minimal JPEG/JFIF
# header, so the content starts with the JPEG magic number FF D8 FF E0.
test_file = os.path.join(storage_path, 'ac3681aaa7a346b49ef9c7ceb7b94058.jpg')
with open(test_file, 'wb') as f:
    # SOI, APP0 marker + length, "JFIF\0", version 1.1, density 1x1,
    # no thumbnail, EOI.
    f.write(bytes.fromhex(
        'FF D8 FF E0 00 10 4A 46 49 46 00 01 01 00 00 01 00 01 00 00 FF D9'
    ))

print(f"测试文件已创建:{test_file}")
print(f"文件大小:{os.path.getsize(test_file)} bytes")
print("\n访问URL")
print("http://localhost:8080/storage/mineru/images/ac3681aaa7a346b49ef9c7ceb7b94058.jpg")
print("\n如果Django服务正在运行可以直接在浏览器中访问上述URL")