add mineru
This commit is contained in:
parent
35f9a4dbfe
commit
f0263bf189
61
MINERU_STORAGE_README.md
Normal file
61
MINERU_STORAGE_README.md
Normal file
@ -0,0 +1,61 @@
|
||||
# MinerU 图片存储配置说明
|
||||
|
||||
## 访问URL格式
|
||||
|
||||
MinerU解析后的图片访问URL格式为:
|
||||
```
|
||||
http://localhost:8080/storage/mineru/images/xxx.jpg
|
||||
```
|
||||
|
||||
URL 简洁直观:直接通过 `/storage/` 路径访问。
|
||||
|
||||
## 存储路径配置
|
||||
|
||||
### 本地开发环境
|
||||
- 存储路径:`./tmp/maxkb/storage/`
|
||||
- 图片位置:`./tmp/maxkb/storage/mineru/images/`
|
||||
|
||||
### Docker环境
|
||||
- 容器内路径:`/opt/maxkb/storage/`
|
||||
- 本地映射路径:`~/.maxkb/storage/`
|
||||
- 图片位置:`~/.maxkb/storage/mineru/images/`
|
||||
|
||||
## 环境变量配置
|
||||
|
||||
在 `.env` 文件或 docker-compose.yml 中添加:
|
||||
```bash
|
||||
MAXKB_STORAGE_PATH=/opt/maxkb/storage
|
||||
```
|
||||
|
||||
## Docker Volume配置
|
||||
|
||||
在 `docker-compose.yml` 中已配置:
|
||||
```yaml
|
||||
volumes:
|
||||
- ~/.maxkb/storage:/opt/maxkb/storage:rw
|
||||
```
|
||||
|
||||
## 测试访问
|
||||
|
||||
1. 运行测试脚本创建测试图片:
|
||||
```bash
|
||||
python test_image_access.py
|
||||
```
|
||||
|
||||
2. 访问测试URL:
|
||||
```
|
||||
http://localhost:8080/storage/mineru/images/ac3681aaa7a346b49ef9c7ceb7b94058.jpg
|
||||
```
|
||||
|
||||
## 故障排查
|
||||
|
||||
1. **404错误**:检查文件是否存在于存储目录
|
||||
2. **权限错误**:确保运行 MaxKB 的用户对存储目录同时具有读取和写入权限(写入用于保存图片,读取用于通过 `/storage/` 访问)
|
||||
3. **路径错误**:确认URL路径以 `/storage/` 开头
|
||||
|
||||
## 相关文件
|
||||
|
||||
- 存储视图:`apps/oss/views/storage.py`
|
||||
- URL配置:主URL配置文件(`/storage/` 路由通过函数视图 `serve_storage_file` 注册;`apps/oss/urls.py` 中不再包含该路由)
|
||||
- MinerU适配器:`apps/common/handle/impl/mineru/maxkb_adapter/adapter.py`
|
||||
- 配置文件:`apps/common/handle/impl/mineru/maxkb_adapter/config_maxkb.py`
|
||||
@ -268,12 +268,35 @@ class BaseMinerUExtractor:
|
||||
self.logger.info(f"mineru-parser: found {len(cached_pages)} cached pages")
|
||||
|
||||
# 获取上传回调(通过适配器)
|
||||
upload_callback = lambda file_path: self.adapter.upload_file(file_path, upload_options)
|
||||
# 总是创建upload_callback,让适配器决定如何处理
|
||||
# upload_func 需要接受4个参数: filepath, filename, upload_options, binary_data
|
||||
async def upload_callback(filepath, filename, options, binary_data=None):
    """Upload one image through the platform adapter.

    Args:
        filepath: Path of the image on disk; used when ``binary_data`` is empty.
        filename: Original file name; only its extension is used (as the
            temporary file's suffix) and it appears in the warning message.
        options: Accepted for signature compatibility with the caller; the
            enclosing scope's ``upload_options`` is what is forwarded to the
            adapter.
        binary_data: Optional raw image bytes (the image was re-encoded in
            memory). When present they are written to a temporary file that
            is uploaded instead of ``filepath``.

    Returns:
        A ``(url, upload_key)`` tuple. ``upload_key`` is always ``None``;
        ``(None, None)`` is returned when neither a path nor bytes were given.
    """
    if binary_data:
        import tempfile

        # The adapter works on file paths, so spill the bytes to disk first.
        # delete=False keeps the file alive after the ``with`` block closes it.
        with tempfile.NamedTemporaryFile(suffix=os.path.splitext(filename)[1], delete=False) as tmp:
            tmp.write(binary_data)
            tmp_path = tmp.name

        try:
            result = await self.adapter.upload_file(tmp_path, upload_options)
        finally:
            # Always remove the temporary file, even when the upload raises;
            # a failed cleanup must not mask the upload outcome.
            try:
                os.unlink(tmp_path)
            except OSError:
                pass
    elif filepath:
        result = await self.adapter.upload_file(filepath, upload_options)
    else:
        self.logger.warning(f"No file path or binary data provided for {filename}")
        return None, None

    # Return in (url, upload_key) form.
    return result, None
|
||||
|
||||
# 并行处理文档
|
||||
completed_tasks = await self.parallel_processor.process_document_with_cache(
|
||||
pdf_path, temp_dir, src_fileid, is_ppt_format,
|
||||
len(pages_info), upload_callback if upload_options else None, upload_options,
|
||||
len(pages_info), upload_callback, upload_options,
|
||||
cached_pages=cached_pages,
|
||||
save_callback=lambda idx, data: self._save_page_cache(temp_dir, idx, data)
|
||||
)
|
||||
|
||||
@ -45,6 +45,11 @@ class GPTBaseAdapter(PlatformAdapter):
|
||||
|
||||
async def upload_file(self, file_path: str, options: Any = None) -> str:
|
||||
"""上传文件 - PDF使用S3特殊处理,图片使用gzero上传"""
|
||||
# 如果在测试模式下,直接返回原图地址
|
||||
if os.getenv('MINERU_TEST_FILE'):
|
||||
logger.info(f"Gbase: Test mode - returning original path: {file_path}")
|
||||
return file_path
|
||||
|
||||
import os
|
||||
|
||||
# 判断文件类型
|
||||
|
||||
@ -224,10 +224,10 @@ class ImageOptimizer:
|
||||
upload_options,
|
||||
max_retries: int = 3,
|
||||
retry_delay: float = 1.0) -> Tuple[Optional[str], Optional[str]]:
|
||||
if os.getenv('MINERU_TEST_FILE'):
|
||||
return image_info.filepath, None
|
||||
# return image_info.filepath, None
|
||||
"""上传单个图片(带并发控制和重试机制)"""
|
||||
# 注释掉测试模式,让上传回调能够被调用
|
||||
# if os.getenv('MINERU_TEST_FILE'):
|
||||
# return image_info.filepath, None
|
||||
async with self.upload_semaphore:
|
||||
# 处理图片
|
||||
image_data, hash_value = await self.process_image_for_upload(image_info)
|
||||
@ -344,8 +344,9 @@ class ImageOptimizer:
|
||||
async with self.api_semaphore:
|
||||
try:
|
||||
# 调用分类函数
|
||||
# classify_func expects: (learn_type, image_filepath, temp_dir, src_name, hint)
|
||||
return await classify_func(
|
||||
vision_model,
|
||||
vision_model, # This is actually learn_type
|
||||
img_data['image_info'].filepath,
|
||||
temp_dir,
|
||||
src_name,
|
||||
|
||||
@ -517,11 +517,16 @@ class MinerUImageProcessor:
|
||||
|
||||
This is an enhanced version that uses context when available.
|
||||
"""
|
||||
self.logger.info(f"mineru-image: _classify_single_image_with_context called for {os.path.basename(image_filepath)}")
|
||||
|
||||
# If no context, fall back to original method
|
||||
if not context:
|
||||
self.logger.info(f"mineru-image: no context, falling back to original method")
|
||||
return await self._classify_single_image(learn_type, image_filepath, temp_dir, src_name, hint)
|
||||
|
||||
try:
|
||||
self.logger.info(f"mineru-image: processing with context for {os.path.basename(image_filepath)}")
|
||||
|
||||
if not os.path.exists(image_filepath):
|
||||
raise FileNotFoundError(f"Image file not found: {image_filepath}")
|
||||
|
||||
@ -535,6 +540,9 @@ class MinerUImageProcessor:
|
||||
# Build context-aware prompt with language
|
||||
prompt = self._build_context_aware_prompt(context, language_code)
|
||||
|
||||
# Log the final prompt for debugging
|
||||
self.logger.info(f"mineru-image: Final prompt for {os.path.basename(image_filepath)}:\n{prompt[:1000]}...")
|
||||
|
||||
messages = [
|
||||
{'role': 'system', 'content': prompt},
|
||||
{'role': 'user', 'content': [
|
||||
@ -549,19 +557,34 @@ class MinerUImageProcessor:
|
||||
start_time = time.time()
|
||||
|
||||
try:
|
||||
self.logger.info(f"mineru-image: calling vision model for {os.path.basename(image_filepath)}")
|
||||
response = await self.config.call_litellm(
|
||||
model_type=learn_type,
|
||||
messages=messages,
|
||||
temperature=0.0,
|
||||
timeout=120.0 # Increased timeout to 120 seconds for vision models
|
||||
)
|
||||
self.logger.info(f"mineru-image: received response from vision model")
|
||||
|
||||
duration = time.time() - start_time
|
||||
|
||||
# Log raw response for debugging
|
||||
raw_response = response.choices[0].message.content if response.choices else ""
|
||||
self.logger.info(f"mineru-image: raw AI response (first 500 chars): {raw_response[:500] if raw_response else 'Empty response'}")
|
||||
# Log complete response for debugging
|
||||
self.logger.info(f"mineru-image: FULL AI response for {os.path.basename(image_filepath)}:\n{raw_response}")
|
||||
|
||||
# Log usage info
|
||||
if hasattr(response, 'usage'):
|
||||
self.logger.info(f"mineru-image: usage - prompt_tokens={getattr(response.usage, 'prompt_tokens', 0)}, "
|
||||
f"completion_tokens={getattr(response.usage, 'completion_tokens', 0)}")
|
||||
else:
|
||||
self.logger.warning(f"mineru-image: no usage info in response")
|
||||
|
||||
# Parse enhanced response
|
||||
result = self._parse_context_aware_response(
|
||||
response.choices[0].message.content,
|
||||
response.usage,
|
||||
raw_response,
|
||||
response.usage if hasattr(response, 'usage') else None,
|
||||
duration
|
||||
)
|
||||
|
||||
@ -569,15 +592,21 @@ class MinerUImageProcessor:
|
||||
result['has_context'] = True
|
||||
result['page_idx'] = context.page_idx
|
||||
|
||||
# Log successful classification
|
||||
self.logger.info(f"mineru-image: classified {os.path.basename(image_filepath)} as {result.get('type', 'unknown')} "
|
||||
f"(tokens: in={result.get('input_tokens', 0)}, out={result.get('output_tokens', 0)})")
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"mineru-image: classification error: {str(e)}")
|
||||
self.logger.info(f"mineru-image: classification failed for {os.path.basename(image_filepath)}, returning meaningless")
|
||||
result = {
|
||||
'type': 'meaningless',
|
||||
'content': f'Classification error: {str(e)}',
|
||||
'input_tokens': 0,
|
||||
'output_tokens': 0,
|
||||
'dura': time.time() - start_time,
|
||||
'has_context': True
|
||||
'has_context': True,
|
||||
'error': str(e)
|
||||
}
|
||||
|
||||
return result
|
||||
@ -652,13 +681,16 @@ class MinerUImageProcessor:
|
||||
# Parse JSON
|
||||
result_json = json.loads(response_content)
|
||||
|
||||
# Log the raw classification response for debugging
|
||||
self.logger.info(f"mineru-image: parsed JSON response: {result_json}")
|
||||
|
||||
# Build result dictionary
|
||||
result = {
|
||||
'type': result_json.get('type', 'meaningless'),
|
||||
'title': result_json.get('title', ''),
|
||||
'content': result_json.get('description', ''),
|
||||
'input_tokens': usage.prompt_tokens if hasattr(usage, 'prompt_tokens') else 0,
|
||||
'output_tokens': usage.completion_tokens if hasattr(usage, 'completion_tokens') else 0,
|
||||
'input_tokens': usage.prompt_tokens if usage and hasattr(usage, 'prompt_tokens') else 0,
|
||||
'output_tokens': usage.completion_tokens if usage and hasattr(usage, 'completion_tokens') else 0,
|
||||
'dura': duration
|
||||
}
|
||||
|
||||
@ -670,13 +702,14 @@ class MinerUImageProcessor:
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"mineru-image: failed to parse context response: {str(e)}")
|
||||
self.logger.debug(f"mineru-image: response that failed to parse: {response_content[:500] if response_content else 'Empty'}")
|
||||
# Return a basic result
|
||||
return {
|
||||
'type': 'brief_description',
|
||||
'title': '',
|
||||
'content': response_content[:200] if response_content else '',
|
||||
'input_tokens': usage.prompt_tokens if hasattr(usage, 'prompt_tokens') else 0,
|
||||
'output_tokens': usage.completion_tokens if hasattr(usage, 'completion_tokens') else 0,
|
||||
'input_tokens': usage.prompt_tokens if usage and hasattr(usage, 'prompt_tokens') else 0,
|
||||
'output_tokens': usage.completion_tokens if usage and hasattr(usage, 'completion_tokens') else 0,
|
||||
'dura': duration
|
||||
}
|
||||
|
||||
@ -736,6 +769,9 @@ class MinerUImageProcessor:
|
||||
# Parse response
|
||||
response_content = response.choices[0].message.content
|
||||
|
||||
# Log complete response for debugging
|
||||
self.logger.info(f"mineru-image: FULL AI response for {os.path.basename(image_filepath)} (no context):\n{response_content}")
|
||||
|
||||
# Extract JSON from markdown code block if present
|
||||
if '```json' in response_content and '```' in response_content:
|
||||
try:
|
||||
@ -797,7 +833,8 @@ class MinerUImageProcessor:
|
||||
'dura': time.time() - start_time,
|
||||
}
|
||||
|
||||
self.logger.info(f"mineru-image: classified {image_filepath} as {result.get('type', 'unknown')}")
|
||||
# Enhanced logging to debug meaningless classification
|
||||
self.logger.info(f"mineru-image: classified {image_filepath} as {result.get('type', 'unknown')} - tokens: in={result.get('input_tokens', 0)}, out={result.get('output_tokens', 0)}, error={result.get('error', 'None')}")
|
||||
|
||||
return result
|
||||
|
||||
|
||||
@ -83,7 +83,10 @@ def get_module_logger(module_name):
|
||||
pass
|
||||
|
||||
# 默认情况,创建独立的logger
|
||||
return logging.getLogger(module_name)
|
||||
module_logger = logging.getLogger(module_name)
|
||||
# 确保使用正确的日志级别
|
||||
module_logger.setLevel(logging.INFO)
|
||||
return module_logger
|
||||
|
||||
# 为了兼容性,导出默认logger
|
||||
logger = get_module_logger('mineru')
|
||||
|
||||
@ -30,6 +30,11 @@ class MaxKBAdapter(PlatformAdapter):
|
||||
self.file_storage = FileStorageClient()
|
||||
self.model_client = maxkb_model_client
|
||||
|
||||
# 导入配置以获取存储路径
|
||||
from .config_maxkb import MaxKBMinerUConfig
|
||||
self.config = MaxKBMinerUConfig()
|
||||
self.storage_path = self.config.file_storage_path
|
||||
|
||||
@contextlib.asynccontextmanager
|
||||
async def trace_context(self, trace_id: str):
|
||||
"""MaxKB的trace上下文 - 如果没有特殊实现,使用简单的上下文"""
|
||||
@ -51,19 +56,68 @@ class MaxKBAdapter(PlatformAdapter):
|
||||
logger.debug(f"MaxKB: Released lock for {temp_dir}")
|
||||
|
||||
async def upload_file(self, file_path: str, options: Any = None) -> str:
    """Copy a local file into the MaxKB storage directory and return its URL.

    The file is copied to ``<storage_path>/<sub_dir>/images/<uuid><ext>``,
    where ``sub_dir`` is the first element of ``options`` (treated as the
    knowledge id) or ``'mineru'`` when none is supplied.

    Args:
        file_path: Path of the local file to store.
        options: Optional tuple/list whose first element is used as the
            knowledge id; any other value is ignored.

    Returns:
        ``<MAXKB_BASE_URL>/storage/<relative path>`` when the
        ``MAXKB_BASE_URL`` environment variable is set, otherwise the
        relative URL ``/storage/<relative path>``. If the source file is
        missing or the copy fails, ``file_path`` is returned unchanged.
    """
    import shutil
    import uuid

    logger.info(f"MaxKB: upload_file called with path={file_path}, options={options}")

    # NOTE: the former MINERU_TEST_FILE short-circuit (return the original
    # path without copying) is disabled; files are always copied.

    try:
        # Nothing to copy: hand the original path back to the caller.
        if not os.path.exists(file_path):
            logger.warning(f"MaxKB: File not found: {file_path}")
            return file_path

        # The knowledge id, when provided, is the first element of options.
        knowledge_id = None
        if isinstance(options, (tuple, list)) and options:
            knowledge_id = options[0]

        # Coerce to str so non-string ids (e.g. uuid.UUID) do not make
        # os.path.join raise and silently fall into the failure path.
        sub_dir = str(knowledge_id) if knowledge_id else 'mineru'
        storage_dir = os.path.join(self.storage_path, sub_dir, 'images')
        os.makedirs(storage_dir, exist_ok=True)

        # Unique file name that keeps the original extension.
        file_ext = os.path.splitext(file_path)[1]
        file_name = f"{uuid.uuid4().hex}{file_ext}"
        dest_path = os.path.join(storage_dir, file_name)

        shutil.copy2(file_path, dest_path)

        # Path relative to the storage root, with forward slashes on every OS.
        relative_path = os.path.relpath(dest_path, self.storage_path)
        relative_path = relative_path.replace(os.path.sep, '/')

        base_url = os.getenv('MAXKB_BASE_URL', '')
        if base_url:
            result_url = f"{base_url.rstrip('/')}/storage/{relative_path}"
        else:
            result_url = f"/storage/{relative_path}"

        logger.info(f"MaxKB: Copied file {file_path} -> {dest_path}")
        logger.debug(f"MaxKB: Returning URL: {result_url}")

        return result_url

    except Exception as e:
        # Best effort: on any failure fall back to the local path.
        logger.error(f"MaxKB: Failed to copy file {file_path}: {str(e)}")
        return file_path
|
||||
|
||||
def get_logger(self):
|
||||
|
||||
@ -28,7 +28,7 @@ class MaxKBMinerUConfig(MinerUConfig):
|
||||
|
||||
# File storage settings
|
||||
self.file_storage_type = os.getenv('MAXKB_STORAGE_TYPE', 'local') # local, s3, oss
|
||||
self.file_storage_path = os.getenv('MAXKB_STORAGE_PATH', '/tmp/maxkb/storage')
|
||||
self.file_storage_path = os.getenv('MAXKB_STORAGE_PATH', '/opt/maxkb/storage')
|
||||
self.file_storage_bucket = os.getenv('MAXKB_STORAGE_BUCKET')
|
||||
|
||||
# Model client settings
|
||||
@ -133,14 +133,23 @@ class MaxKBMinerUConfig(MinerUConfig):
|
||||
|
||||
# Call appropriate method based on content type
|
||||
if has_images:
|
||||
# Extract image and text for vision model
|
||||
# Extract image and combine all text content for vision model
|
||||
image_path = None
|
||||
prompt = ""
|
||||
combined_prompt = ""
|
||||
|
||||
# First, collect system message if exists
|
||||
for msg in messages:
|
||||
if msg.get('role') == 'system':
|
||||
combined_prompt = msg.get('content', '') + "\n\n"
|
||||
break
|
||||
|
||||
# Then extract user message content
|
||||
for msg in messages:
|
||||
if msg.get('role') == 'user':
|
||||
if isinstance(msg.get('content'), list):
|
||||
for content_item in msg['content']:
|
||||
if content_item.get('type') == 'text':
|
||||
prompt = content_item.get('text', '')
|
||||
combined_prompt += content_item.get('text', '')
|
||||
elif content_item.get('type') == 'image_url':
|
||||
image_url = content_item.get('image_url', {})
|
||||
if isinstance(image_url, dict):
|
||||
@ -158,12 +167,14 @@ class MaxKBMinerUConfig(MinerUConfig):
|
||||
image_path = tmp.name
|
||||
else:
|
||||
image_path = url
|
||||
elif isinstance(msg.get('content'), str):
|
||||
combined_prompt += msg.get('content', '')
|
||||
|
||||
if image_path:
|
||||
response_text = await maxkb_model_client.vision_completion(
|
||||
model_id=model_id,
|
||||
image_path=image_path,
|
||||
prompt=prompt,
|
||||
prompt=combined_prompt,
|
||||
**kwargs
|
||||
)
|
||||
else:
|
||||
@ -215,8 +226,14 @@ class MaxKBMinerUConfig(MinerUConfig):
|
||||
'total_tokens': 0
|
||||
})()
|
||||
|
||||
# Return empty response on error to continue processing
|
||||
return MockResponse("")
|
||||
# Return a valid JSON response on error to prevent parsing issues
|
||||
# This will be parsed as a brief_description type
|
||||
error_response = json.dumps({
|
||||
"type": "brief_description",
|
||||
"title": "Error",
|
||||
"description": f"Model call failed: {str(e)}"
|
||||
})
|
||||
return MockResponse(error_response)
|
||||
|
||||
def _get_default_llm_model_id(self) -> str:
|
||||
"""获取默认的LLM模型ID"""
|
||||
|
||||
@ -187,8 +187,13 @@ class MaxKBModelClient:
|
||||
llm_model = await self.get_llm_model(model_id)
|
||||
|
||||
if not llm_model:
|
||||
self.logger.warning(f"No model available for {model_id}, returning empty response")
|
||||
return ""
|
||||
self.logger.warning(f"No model available for {model_id}, returning error JSON")
|
||||
import json
|
||||
return json.dumps({
|
||||
"type": "brief_description",
|
||||
"title": "No Model",
|
||||
"description": "LLM model not available"
|
||||
})
|
||||
|
||||
# 调用模型 - 使用 sync_to_async 包装同步调用
|
||||
response = await sync_to_async(llm_model.invoke)(messages)
|
||||
@ -203,8 +208,13 @@ class MaxKBModelClient:
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Chat completion failed: {str(e)}")
|
||||
# 返回空字符串而不是抛出异常,让处理继续
|
||||
return ""
|
||||
# 返回错误JSON而不是空字符串
|
||||
import json
|
||||
return json.dumps({
|
||||
"type": "brief_description",
|
||||
"title": "Error",
|
||||
"description": f"Chat completion failed: {str(e)}"
|
||||
})
|
||||
|
||||
async def vision_completion(self, model_id: str, image_path: str, prompt: str, **kwargs) -> str:
|
||||
"""
|
||||
@ -224,16 +234,48 @@ class MaxKBModelClient:
|
||||
vision_model = await self.get_vision_model(model_id)
|
||||
|
||||
if not vision_model:
|
||||
self.logger.warning(f"No vision model available for {model_id}, returning empty response")
|
||||
return ""
|
||||
self.logger.warning(f"No vision model available for {model_id}, returning error JSON")
|
||||
# Return a valid JSON response instead of empty string
|
||||
import json
|
||||
return json.dumps({
|
||||
"type": "brief_description",
|
||||
"title": "No Model",
|
||||
"description": "Vision model not available"
|
||||
})
|
||||
|
||||
# 构造消息
|
||||
# 读取图片并转换为base64
|
||||
import base64
|
||||
import os
|
||||
|
||||
if not os.path.exists(image_path):
|
||||
self.logger.error(f"Image file not found: {image_path}")
|
||||
import json
|
||||
return json.dumps({
|
||||
"type": "brief_description",
|
||||
"title": "File Error",
|
||||
"description": f"Image file not found: {image_path}"
|
||||
})
|
||||
|
||||
try:
|
||||
with open(image_path, 'rb') as img_file:
|
||||
image_data = img_file.read()
|
||||
image_base64 = base64.b64encode(image_data).decode('utf-8')
|
||||
except Exception as e:
|
||||
self.logger.error(f"Failed to read/encode image {image_path}: {str(e)}")
|
||||
import json
|
||||
return json.dumps({
|
||||
"type": "brief_description",
|
||||
"title": "Image Error",
|
||||
"description": f"Failed to read/encode image: {str(e)}"
|
||||
})
|
||||
|
||||
# 构造消息 - 使用base64编码的图片
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "text", "text": prompt},
|
||||
{"type": "image_url", "image_url": {"url": f"file://{image_path}"}}
|
||||
{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"}}
|
||||
]
|
||||
}
|
||||
]
|
||||
@ -251,8 +293,13 @@ class MaxKBModelClient:
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Vision completion failed: {str(e)}")
|
||||
# 返回空字符串而不是抛出异常,让处理继续
|
||||
return ""
|
||||
# 返回错误JSON而不是空字符串
|
||||
import json
|
||||
return json.dumps({
|
||||
"type": "brief_description",
|
||||
"title": "Vision Error",
|
||||
"description": f"Vision completion failed: {str(e)}"
|
||||
})
|
||||
|
||||
async def batch_chat_completion(self, model_id: str, batch_messages: List[List[Dict]], **kwargs) -> List[str]:
|
||||
"""
|
||||
|
||||
@ -610,6 +610,16 @@ class ParallelMinerUProcessor:
|
||||
xref = image_info.xref
|
||||
if xref in classification_results:
|
||||
result = classification_results[xref]
|
||||
else:
|
||||
# No classification result - likely an error occurred
|
||||
self.logger.warning(f"Recognizer: no classification result for {image_info.filename}, creating default result")
|
||||
result = {
|
||||
'type': 'meaningless',
|
||||
'content': 'Classification failed - no result returned',
|
||||
'input_tokens': 0,
|
||||
'output_tokens': 0,
|
||||
'error': 'No classification result'
|
||||
}
|
||||
|
||||
# Apply meaningless filter if configured
|
||||
if self.config.filter_meaningless_images and result.get('type') == 'meaningless':
|
||||
@ -977,25 +987,24 @@ class ParallelMinerUProcessor:
|
||||
f"images_count={len(task.images)}")
|
||||
|
||||
if has_content and (has_no_images or has_processed_images):
|
||||
# Integrate image descriptions into content before marking complete
|
||||
if task.processed_images and task.image_descriptions:
|
||||
self.logger.info(f"Page {task.page_idx + 1} ready for image integration:")
|
||||
self.logger.info(f" - processed_images: {list(task.processed_images.keys())}")
|
||||
# Integrate images into content if we have any image descriptions
|
||||
# This ensures meaningless images are properly removed from content
|
||||
if task.image_descriptions:
|
||||
self.logger.info(f"Page {task.page_idx + 1} processing image integration:")
|
||||
self.logger.info(f" - processed_images: {list(task.processed_images.keys()) if task.processed_images else 'None (filtered out)'}")
|
||||
self.logger.info(f" - image_descriptions: {list(task.image_descriptions.keys())}")
|
||||
self.logger.info(f" - content length before: {len(task.refined_content)} chars")
|
||||
|
||||
task.refined_content = self._integrate_images_into_content(
|
||||
task.refined_content,
|
||||
task.image_descriptions,
|
||||
task.processed_images,
|
||||
task.processed_images or {}, # Pass empty dict if None
|
||||
f"{task.src_fileid}_page_{task.page_idx}"
|
||||
)
|
||||
|
||||
self.logger.info(f" - content length after: {len(task.refined_content)} chars")
|
||||
else:
|
||||
self.logger.info(f"Page {task.page_idx + 1} has no images to integrate: "
|
||||
f"processed_images={bool(task.processed_images)}, "
|
||||
f"image_descriptions={bool(task.image_descriptions)}")
|
||||
self.logger.info(f"Page {task.page_idx + 1} has no images to process")
|
||||
|
||||
task.status = TaskStatus.COMPLETED
|
||||
should_mark_complete = True
|
||||
@ -1032,8 +1041,23 @@ class ParallelMinerUProcessor:
|
||||
# Process each image description
|
||||
for filename, desc_info in image_descriptions.items():
|
||||
self.logger.info(f"\nChecking image {filename} for replacement")
|
||||
if filename in uploaded_images:
|
||||
uploaded_url = uploaded_images[filename]
|
||||
|
||||
# Get image type to determine if it's meaningless
|
||||
img_type = desc_info.get('type', 'brief_description')
|
||||
|
||||
# Process ALL images that have descriptions
|
||||
# - Meaningless images: remove references (replace with empty string)
|
||||
# - Images not uploaded but classified: also remove (likely filtered)
|
||||
# - Uploaded images: replace with proper markdown
|
||||
uploaded_url = uploaded_images.get(filename, '')
|
||||
|
||||
if img_type == 'meaningless':
|
||||
self.logger.info(f" - Image is meaningless, will remove references")
|
||||
elif filename not in uploaded_images:
|
||||
self.logger.info(f" - Image was classified as {img_type} but not uploaded (filtered), will remove references")
|
||||
# Treat as meaningless for removal purposes
|
||||
img_type = 'meaningless'
|
||||
else:
|
||||
self.logger.info(f" - Found in uploaded_images: {uploaded_url}")
|
||||
|
||||
# Extract the hash part from the filename
|
||||
@ -1057,15 +1081,15 @@ class ParallelMinerUProcessor:
|
||||
ref = base_filename
|
||||
|
||||
# Build replacement content based on image type
|
||||
img_type = desc_info.get('type', 'brief_description')
|
||||
# img_type already extracted above
|
||||
title = desc_info.get('title', '')
|
||||
description = desc_info.get('content', '')
|
||||
ocr_content = desc_info.get('ocr_content', '')
|
||||
|
||||
# Create the replacement markdown
|
||||
if img_type == 'meaningless':
|
||||
# Skip meaningless images
|
||||
continue
|
||||
# For meaningless images, we want to remove them entirely
|
||||
replacement = "" # Empty string to remove the image reference
|
||||
elif img_type == 'structured_content':
|
||||
# For structured content, include full description
|
||||
replacement = f"\n\n\n<!--{description}-->\n\n{ocr_content}\n\n"
|
||||
@ -1120,8 +1144,6 @@ class ParallelMinerUProcessor:
|
||||
self.logger.info(f"Tried patterns:")
|
||||
for p in patterns:
|
||||
self.logger.info(f" - {p}")
|
||||
else:
|
||||
self.logger.warning(f"Image {filename} not found in uploaded_images!")
|
||||
|
||||
return enhanced_content
|
||||
|
||||
|
||||
@ -56,18 +56,6 @@ IMAGE_CLASSIFICATION_CONTEXT_BASE = IMAGE_CLASSIFICATION_BASE + """
|
||||
- description字段请控制在100-200字以内,需要:
|
||||
- 解释图片与周围文本的关系
|
||||
- 说明图片在文档中的作用
|
||||
- context_relevance 字段,表示图片与上下文的相关性(high/medium/low)
|
||||
|
||||
# 输出格式:
|
||||
```json
|
||||
{
|
||||
"type": "分类类型",
|
||||
"title": "简短标题",
|
||||
"description": "详细描述",
|
||||
"ocr_content": "提取的文字内容(如适用)",
|
||||
"context_relevance": "相关性等级"
|
||||
}
|
||||
```
|
||||
|
||||
# 上下文信息:
|
||||
"""
|
||||
|
||||
@ -72,6 +72,65 @@ def pro():
|
||||
if not settings.DEBUG:
|
||||
pro()
|
||||
|
||||
# 添加storage路由 - 使用函数视图避免类导入问题
|
||||
def serve_storage_file(request, file_path):
    """Serve a file located under the storage directory.

    The storage root is read from ``MAXKB_STORAGE_PATH`` (default
    ``/opt/maxkb/storage``); when that directory does not exist the
    relative path ``./tmp/maxkb/storage`` is used instead.

    Args:
        request: The incoming HTTP request (not inspected).
        file_path: Path of the requested file relative to the storage root,
            as captured from the URL.

    Returns:
        An ``HttpResponse`` with the file content. Images are sent inline
        and cached for 30 days; other files are sent as attachments and
        cached for 1 day.

    Raises:
        Http404: If the resolved path escapes the storage root, is not a
            regular file, or cannot be read.
    """
    import os
    import mimetypes
    from django.http import HttpResponse, Http404
    from django.utils.encoding import escape_uri_path

    # Storage root: configured path, or the local-development fallback.
    base_path = os.getenv('MAXKB_STORAGE_PATH', '/opt/maxkb/storage')
    if not os.path.exists(base_path):
        base_path = './tmp/maxkb/storage'
    full_path = os.path.join(base_path, file_path)

    # Containment check on fully resolved paths. commonpath compares whole
    # path components, so a sibling such as "<root>_backup" is rejected
    # (a plain string prefix test would accept it).
    try:
        real_base = os.path.realpath(base_path)
        real_path = os.path.realpath(full_path)
        if os.path.commonpath([real_base, real_path]) != real_base:
            raise Http404("File not found")
    except (OSError, ValueError):
        raise Http404("File not found")

    if not os.path.isfile(real_path):
        raise Http404("File not found")

    # Read the path that was validated above.
    try:
        with open(real_path, 'rb') as f:
            file_content = f.read()
    except OSError:
        raise Http404("File not found")

    content_type, _ = mimetypes.guess_type(full_path)
    if not content_type:
        content_type = 'application/octet-stream'

    response = HttpResponse(file_content, content_type=content_type)

    file_name = os.path.basename(full_path)
    if content_type.startswith('image/'):
        response['Content-Disposition'] = f'inline; filename="{escape_uri_path(file_name)}"'
        response['Cache-Control'] = 'public, max-age=2592000'  # cache for 30 days
    else:
        response['Content-Disposition'] = f'attachment; filename="{escape_uri_path(file_name)}"'
        response['Cache-Control'] = 'public, max-age=86400'  # cache for 1 day

    return response
|
||||
|
||||
# 添加storage路由
|
||||
urlpatterns.insert(0, re_path(r'^storage/(?P<file_path>.*)$', serve_storage_file, name='storage_file'))
|
||||
|
||||
|
||||
def get_index_html(index_path):
|
||||
file = open(index_path, "r", encoding='utf-8')
|
||||
|
||||
@ -6,4 +6,5 @@ app_name = 'oss'
|
||||
|
||||
urlpatterns = [
|
||||
path('oss/file', views.FileView.as_view()),
|
||||
# storage路由已移至主URL配置中
|
||||
]
|
||||
|
||||
@ -1 +1,2 @@
|
||||
from .file import *
|
||||
from .storage import StorageFileView
|
||||
87
apps/oss/views/storage.py
Normal file
87
apps/oss/views/storage.py
Normal file
@ -0,0 +1,87 @@
|
||||
# coding=utf-8
|
||||
"""
|
||||
Storage file service for MinerU parsed images
|
||||
"""
|
||||
import os
|
||||
import mimetypes
|
||||
from pathlib import Path
|
||||
|
||||
from django.http import HttpResponse, Http404
|
||||
from django.utils.encoding import escape_uri_path
|
||||
from django.views import View
|
||||
|
||||
|
||||
class StorageFileView(View):
    """Serve files produced by MinerU parsing (e.g. extracted images).

    Built on Django's plain ``View`` class, so this view does not involve
    the authentication system.
    """

    def get(self, request, file_path: str):
        """Return the stored file identified by ``file_path``.

        Args:
            request: The incoming HTTP request (not inspected).
            file_path: File path relative to the storage root, e.g.
                ``mineru/images/xxx.jpg``.

        Returns:
            An ``HttpResponse`` with the file content. Images are sent
            inline and cached for 30 days; other files are sent as
            attachments and cached for 1 day.

        Raises:
            Http404: If the resolved path escapes the storage root, is not
                a regular file, or cannot be read.
        """
        # Storage root comes from the environment (default /opt/maxkb/storage).
        base_path = os.getenv('MAXKB_STORAGE_PATH', '/opt/maxkb/storage')
        full_path = os.path.join(base_path, file_path)

        # Containment check on fully resolved paths (symlinks included).
        # commonpath compares whole path components, so a sibling directory
        # that merely shares the root's string prefix is rejected.
        try:
            real_base = os.path.realpath(base_path)
            real_path = os.path.realpath(full_path)
            if os.path.commonpath([real_base, real_path]) != real_base:
                raise Http404("File not found")
        except (OSError, ValueError):
            raise Http404("File not found")

        if not os.path.isfile(real_path):
            raise Http404("File not found")

        # Read the path that was validated above.
        try:
            with open(real_path, 'rb') as f:
                file_content = f.read()
        except OSError:
            raise Http404("File not found")

        content_type, _ = mimetypes.guess_type(full_path)
        if not content_type:
            content_type = 'application/octet-stream'

        response = HttpResponse(file_content, content_type=content_type)

        file_name = os.path.basename(full_path)
        if content_type.startswith('image/'):
            # Images render in the browser and may be cached for 30 days.
            disposition = 'inline'
            cache_control = 'public, max-age=2592000'
        else:
            # Everything else is downloaded and cached for 1 day.
            disposition = 'attachment'
            cache_control = 'public, max-age=86400'

        # escape_uri_path handles special characters in the file name.
        response['Content-Disposition'] = f'{disposition}; filename="{escape_uri_path(file_name)}"'
        response['Cache-Control'] = cache_control

        return response
|
||||
@ -15,12 +15,50 @@ services:
|
||||
- ../tmp:/tmp
|
||||
# 数据持久化
|
||||
- ~/.maxkb:/opt/maxkb
|
||||
# MinerU图片存储目录持久化
|
||||
- ~/.maxkb/storage:/opt/maxkb/storage:rw
|
||||
environment:
|
||||
# 开启调试模式
|
||||
DJANGO_DEBUG: "True"
|
||||
PYTHONUNBUFFERED: "1"
|
||||
MAXKB_LOG_LEVEL: "DEBUG"
|
||||
|
||||
# MinerU 配置
|
||||
MINERU_PARSER_CACHE: "True"
|
||||
MINERU_MULTIMODAL_REFINEMENT: "True"
|
||||
# MinerU 图片存储路径
|
||||
MAXKB_STORAGE_PATH: "/opt/maxkb/storage"
|
||||
|
||||
MINERU_API_TYPE: "self_hosted"
|
||||
MINERU_API_URL: "http://mineru:8000"
|
||||
|
||||
# MINERU_API_TYPE: "cloud"
|
||||
# MINERU_API_URL: "https://mineru.net"
|
||||
# MINERU_API_KEY: "<your-mineru-api-key>"
|
||||
|
||||
# 配置队列大小
|
||||
MINERU_QUEUE_SIZE: "50"
|
||||
# 配置处理超时时间(秒)
|
||||
MINERU_PROCESSING_TIMEOUT: "7200"
|
||||
# 配置各线程数量(支持多线程)
|
||||
MINERU_PARSER_THREADS: "1"
|
||||
MINERU_REFINER_THREADS: "3"
|
||||
MINERU_RECOGNIZER_THREADS: "3"
|
||||
MINERU_UPLOADER_THREADS: "1"
|
||||
|
||||
MINERU_BATCH_PROCESSING: "true"
|
||||
MINERU_BATCH_SIZE: "10"
|
||||
# 启用/禁用过滤
|
||||
MINERU_SKIP_SMALL_IMAGES: "true"
|
||||
# 每页最多图片数
|
||||
MINERU_MAX_IMAGES_PER_PAGE: "10"
|
||||
# 文档最多图片数
|
||||
MINERU_MAX_IMAGES_PER_DOCUMENT: "200"
|
||||
# 最小图片尺寸(像素)
|
||||
MINERU_MIN_IMAGE_SIZE: "10000"
|
||||
# 最大图片尺寸(像素)
|
||||
MINERU_MAX_IMAGE_SIZE: "10000000"
|
||||
# 过滤无意义图片
|
||||
MINERU_FILTER_MEANINGLESS: "true"
|
||||
restart: unless-stopped
|
||||
# 使用镜像默认的启动命令
|
||||
|
||||
@ -75,6 +75,11 @@ ENV MAXKB_VERSION="${DOCKER_IMAGE_TAG} (build at ${BUILD_AT}, commit: ${GITHUB_C
|
||||
PATH=/opt/py3/bin:$PATH \
|
||||
PIP_TARGET=/opt/maxkb/python-packages
|
||||
|
||||
# Install poppler-utils for PDF processing (required by MinerU)
|
||||
RUN apt-get update && \
|
||||
apt-get install -y --no-install-recommends poppler-utils && \
|
||||
apt-get clean all && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
WORKDIR /opt/maxkb-app
|
||||
COPY --from=stage-build /opt/maxkb-app /opt/maxkb-app
|
||||
|
||||
59
test_image_access.py
Normal file
59
test_image_access.py
Normal file
@ -0,0 +1,59 @@
|
||||
#!/usr/bin/env python
|
||||
"""
|
||||
测试图片存储和访问
|
||||
|
||||
这个脚本会:
|
||||
1. 创建一个测试图片在存储目录
|
||||
2. 打印正确的访问URL
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
|
||||
def main():
    """Create a placeholder image in the storage directory and print its access URLs.

    The storage root is read from ``MAXKB_STORAGE_PATH`` and defaults to the
    local development location ``./tmp/maxkb/storage``. After writing the
    placeholder file, the function prints the URLs and a listing of every
    file under the storage root together with its size.
    """
    banner = "=" * 60
    storage_path = os.getenv('MAXKB_STORAGE_PATH', './tmp/maxkb/storage')

    print(banner)
    print("MaxKB 图片存储和访问测试")
    print(banner)

    # Make sure <storage>/mineru/images exists.
    image_dir = os.path.join(storage_path, 'mineru', 'images')
    os.makedirs(image_dir, exist_ok=True)
    print(f"\n1. 存储目录:{image_dir}")

    # The payload is placeholder bytes, not real image data.
    test_image = os.path.join(image_dir, 'ac3681aaa7a346b49ef9c7ceb7b94058.jpg')
    with open(test_image, 'wb') as handle:
        handle.write(b'TEST IMAGE CONTENT')
    print(f"2. 创建测试文件:{test_image}")

    # Both environments expose the file under the same URL.
    print("\n3. 访问URL:")
    print(" 本地开发:http://localhost:8080/storage/mineru/images/ac3681aaa7a346b49ef9c7ceb7b94058.jpg")
    print(" Docker环境:http://localhost:8080/storage/mineru/images/ac3681aaa7a346b49ef9c7ceb7b94058.jpg")

    # Print the storage tree, indenting one step per directory level.
    print("\n4. 存储目录内容:")
    for current_dir, _subdirs, names in os.walk(storage_path):
        depth = current_dir.replace(storage_path, '').count(os.sep)
        print(f"{' ' * depth}{os.path.basename(current_dir)}/")
        pad = ' ' * (depth + 1)
        for name in names:
            size = os.path.getsize(os.path.join(current_dir, name))
            print(f'{pad}{name} ({size} bytes)')

    print("\n" + banner)
    for closing_line in (
        "测试完成!",
        "\n注意事项:",
        "1. 确保Django服务器正在运行",
        "2. URL路径现在是 /storage/ 开头,简洁直接",
        "3. 如果使用Docker,确保volume正确挂载",
        banner,
    ):
        print(closing_line)


if __name__ == "__main__":
    main()
|
||||
131
test_storage.py
Normal file
131
test_storage.py
Normal file
@ -0,0 +1,131 @@
|
||||
#!/usr/bin/env python
|
||||
"""
|
||||
测试MinerU图片存储和访问功能
|
||||
|
||||
使用方法:
|
||||
1. 在本地开发环境:python test_storage.py
|
||||
2. 在Docker环境:docker exec -it maxkb-dev python /opt/maxkb-app/test_storage.py
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import tempfile
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
|
||||
def test_storage():
    """Check that the storage directory is writable and print the access URL.

    The storage root is read from ``MAXKB_STORAGE_PATH`` (default
    ``/opt/maxkb/storage``). A small text file is written under
    ``<storage>/test/images`` and the resulting ``/storage/...`` URL and a
    listing of the storage tree are printed.

    Returns:
        True when the test file was created, False when the directory or the
        file could not be created.
    """
    print("=" * 60)
    print("MinerU 图片存储测试")
    print("=" * 60)

    # 1. Resolve the configured storage root.
    storage_path = os.getenv('MAXKB_STORAGE_PATH', '/opt/maxkb/storage')
    print(f"\n1. 存储路径配置:{storage_path}")

    # 2./3. Create the test directory and write the test file. A read-only or
    # missing storage root is reported as a failed check rather than an
    # unhandled traceback.
    test_dir = os.path.join(storage_path, 'test', 'images')
    test_image_path = os.path.join(test_dir, 'test_image.txt')
    print(f"\n2. 创建测试目录:{test_dir}")
    try:
        os.makedirs(test_dir, exist_ok=True)
        print(f"\n3. 创建测试文件:{test_image_path}")
        with open(test_image_path, 'w') as f:
            f.write("This is a test image file for MinerU storage")
    except OSError as e:
        print(f" ✗ 无法写入存储目录:{e}")
        return False

    # 4. Confirm the file really exists on disk.
    if os.path.exists(test_image_path):
        print(" ✓ 文件创建成功")
        file_size = os.path.getsize(test_image_path)
        print(f" 文件大小:{file_size} bytes")
    else:
        print(" ✗ 文件创建失败")
        return False

    # 5. Build the access URL. Files are served under /storage/ (not
    # /api/storage/), and URLs always use forward slashes regardless of the
    # platform's path separator.
    relative_path = os.path.relpath(test_image_path, storage_path).replace(os.sep, '/')
    access_url = f"/storage/{relative_path}"
    print(f"\n4. 生成的访问URL:{access_url}")

    # 6. Print the storage tree, indenting two steps per directory level.
    print("\n5. 存储目录内容:")
    for root, _dirs, files in os.walk(storage_path):
        level = root.replace(storage_path, '').count(os.sep)
        indent = ' ' * 2 * level
        print(f'{indent}{os.path.basename(root)}/')
        subindent = ' ' * 2 * (level + 1)
        for file in files:
            print(f'{subindent}{file}')

    print("\n" + "=" * 60)
    print("测试完成!")
    print("\n配置建议:")
    print("1. 确保Docker volume正确挂载:~/.maxkb/storage:/opt/maxkb/storage")
    print("2. 确保环境变量设置:MAXKB_STORAGE_PATH=/opt/maxkb/storage")
    print("3. 访问图片URL格式:http://localhost:8080/storage/mineru/images/xxx.jpg")
    print("=" * 60)

    return True
|
||||
|
||||
def test_mineru_adapter():
    """Exercise ``MaxKBAdapter.upload_file`` with a temporary file.

    The ``apps`` directory is put on ``sys.path`` so the adapter can be
    imported, a throw-away ``.jpg`` file is created, and the adapter's
    ``upload_file`` coroutine is awaited with it. All failures are reported
    on stdout; nothing is raised to the caller and nothing is returned.
    """
    print("\n" + "=" * 60)
    print("测试MinerU适配器")
    print("=" * 60)

    # Make the project's apps directory importable (container path first,
    # local checkout as the fallback).
    sys.path.insert(0, '/opt/maxkb-app/apps' if os.path.exists('/opt/maxkb-app/apps') else './apps')

    try:
        from common.handle.impl.mineru.maxkb_adapter.adapter import MaxKBAdapter

        print("\n1. 创建MaxKB适配器实例")
        adapter = MaxKBAdapter()
        print(f" 存储路径:{adapter.storage_path}")

        # delete=False keeps the file on disk after the handle is closed so
        # the adapter can open it by name; it is removed explicitly below.
        with tempfile.NamedTemporaryFile(suffix='.jpg', delete=False) as tmp:
            tmp.write(b"Test image content")
            tmp_path = tmp.name

        try:
            print("\n2. 测试upload_file方法")
            print(f" 源文件:{tmp_path}")

            import asyncio

            async def _upload():
                return await adapter.upload_file(tmp_path, options=['test_knowledge'])

            # asyncio.run creates, runs and closes a fresh event loop.
            result_url = asyncio.run(_upload())
            print(f" 返回URL:{result_url}")
        finally:
            # Always remove the temp file, including when the upload raised.
            if os.path.exists(tmp_path):
                os.unlink(tmp_path)

        print("\n✓ MinerU适配器测试成功")

    except ImportError as e:
        print(f"\n✗ 无法导入MinerU适配器:{e}")
        print(" 请确保在MaxKB环境中运行此测试")
    except Exception as e:
        # Script-level boundary: report the failure with a traceback instead
        # of crashing.
        print(f"\n✗ 测试失败:{e}")
        import traceback
        traceback.print_exc()
|
||||
|
||||
if __name__ == "__main__":
    # Run the basic storage check first; the adapter check is only attempted
    # when the storage directory turned out to be usable.
    if test_storage():
        try:
            test_mineru_adapter()
        except Exception:
            # Catch Exception rather than using a bare except so that Ctrl-C
            # and SystemExit still terminate the script.
            print("\n提示:适配器测试需要在MaxKB环境中运行")
|
||||
22
test_storage_simple.py
Normal file
22
test_storage_simple.py
Normal file
@ -0,0 +1,22 @@
|
||||
#!/usr/bin/env python
|
||||
"""
|
||||
简单的存储测试 - 创建测试图片
|
||||
"""
|
||||
import os
|
||||
|
||||
# Create the storage directory used by the local development setup.
storage_path = './tmp/maxkb/storage/mineru/images'
os.makedirs(storage_path, exist_ok=True)

# Create the test image. The payload is only a minimal JFIF header followed
# by the end-of-image marker, so it carries the JPEG magic number
# (FF D8 FF E0) but is not a decodable picture.
test_file = os.path.join(storage_path, 'ac3681aaa7a346b49ef9c7ceb7b94058.jpg')
with open(test_file, 'wb') as f:
    f.write(bytes.fromhex(
        'FFD8'            # SOI  - start of image
        'FFE0' '0010'     # APP0 marker, segment length 16
        '4A46494600'      # "JFIF\0" identifier
        '0101'            # JFIF version 1.01
        '00'              # density units: none
        '0001' '0001'     # X / Y density
        '0000'            # no embedded thumbnail
        'FFD9'            # EOI  - end of image
    ))

print(f"测试文件已创建:{test_file}")
print(f"文件大小:{os.path.getsize(test_file)} bytes")
print("\n访问URL:")
print("http://localhost:8080/storage/mineru/images/ac3681aaa7a346b49ef9c7ceb7b94058.jpg")
print("\n如果Django服务正在运行,可以直接在浏览器中访问上述URL")
|
||||
Loading…
Reference in New Issue
Block a user