From 36da5e1bf3367a5cbb849ee6fb51ec4d2791c485 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=9C=B1=E6=BD=AE?=
Date: Mon, 25 Aug 2025 19:39:23 +0800
Subject: [PATCH] docker platform adapter

---
Review notes (this area is ignored by `git am`; not part of the commit message):
* api_client.py: logs the uploaded file URL and warns when it does not start
  with http:// or https://. The check is guarded with isinstance() so this
  logging-only code cannot itself raise on a non-string URL.
* adapter.py: the "Returning URL" message is logged at info instead of debug.
* parallel_processor.py: MinerUAPIClient is now constructed with
  self.platform_adapter as its second argument.
* test_url_fix.py / test_url_simple.py: manual scripts that print their
  findings instead of asserting; both overwrite MAXKB_BASE_URL at import time.
  test_url_simple.py works in a private temporary directory and removes only
  that directory during cleanup.
* Leading whitespace of the hunk lines was reconstructed; confirm the context
  lines against the target files (or use `git apply --ignore-whitespace`).

 apps/common/handle/impl/mineru/api_client.py  |   3 +
 .../impl/mineru/maxkb_adapter/adapter.py      |   2 +-
 .../handle/impl/mineru/parallel_processor.py  |   2 +-
 test_url_fix.py                               | 121 ++++++++++++++++++
 test_url_simple.py                            |  94 ++++++++++++++
 5 files changed, 220 insertions(+), 2 deletions(-)
 create mode 100644 test_url_fix.py
 create mode 100644 test_url_simple.py

diff --git a/apps/common/handle/impl/mineru/api_client.py b/apps/common/handle/impl/mineru/api_client.py
index 2b661809..55fbbea6 100644
--- a/apps/common/handle/impl/mineru/api_client.py
+++ b/apps/common/handle/impl/mineru/api_client.py
@@ -703,6 +703,9 @@ class MinerUAPIClient:
 
             # Step 1: Upload file to accessible URL
             file_url = await self._upload_file_to_accessible_url(pdf_path, src_fileid)
+            self.logger.info(f"mineru-api: uploaded file URL: {file_url}")
+            if not (isinstance(file_url, str) and file_url.startswith(('http://', 'https://'))):
+                self.logger.warning(f"mineru-api: URL may not be valid for Cloud API: {file_url}")
 
             # Step 2: Create task for full document
             task_id = await self._create_mineru_task_full_document(file_url, src_fileid)
diff --git a/apps/common/handle/impl/mineru/maxkb_adapter/adapter.py b/apps/common/handle/impl/mineru/maxkb_adapter/adapter.py
index 1063a7fa..7b97af18 100644
--- a/apps/common/handle/impl/mineru/maxkb_adapter/adapter.py
+++ b/apps/common/handle/impl/mineru/maxkb_adapter/adapter.py
@@ -111,7 +111,7 @@ class MaxKBAdapter(PlatformAdapter):
                 result_url = f"/storage/{relative_path}"
 
             logger.info(f"MaxKB: Copied file {file_path} -> {dest_path}")
-            logger.debug(f"MaxKB: Returning URL: {result_url}")
+            logger.info(f"MaxKB: Returning URL: {result_url}")
 
             return result_url
 
diff --git a/apps/common/handle/impl/mineru/parallel_processor.py b/apps/common/handle/impl/mineru/parallel_processor.py
index 74b7d1e5..425de7e2 100644
--- a/apps/common/handle/impl/mineru/parallel_processor.py
+++ b/apps/common/handle/impl/mineru/parallel_processor.py
@@ -178,7 +178,7 @@ class ParallelMinerUProcessor:
         document_batch_info = {}  # {src_fileid: {'batch_size': int, 'total_pages': int}}
 
         # Initialize API client
-        async with MinerUAPIClient(self.config) as api_client:
+        async with MinerUAPIClient(self.config, self.platform_adapter) as api_client:
             while not self.shutdown_event.is_set():
                 try:
                     # Get task from queue (timeout to check shutdown)
diff --git a/test_url_fix.py b/test_url_fix.py
new file mode 100644
index 00000000..0833b3f5
--- /dev/null
+++ b/test_url_fix.py
@@ -0,0 +1,121 @@
+#!/usr/bin/env python3
+"""
+Test the URL fix - verify that platform_adapter is passed through correctly
+"""
+
+import os
+import sys
+import asyncio
+from pathlib import Path
+
+# Add paths
+project_root = Path(__file__).parent
+sys.path.insert(0, str(project_root))
+apps_path = project_root / 'apps'
+if apps_path.exists():
+    sys.path.insert(0, str(apps_path))
+
+# Set environment variables for testing
+os.environ['MAXKB_BASE_URL'] = 'http://xbase.aitravelmaster.com'
+os.environ['MINERU_API_TYPE'] = 'cloud'  # Force cloud mode for testing
+
+async def test_url_generation():
+    """Test that URLs are generated correctly"""
+
+    # Import after setting environment
+    from apps.common.handle.impl.mineru.maxkb_adapter.adapter import MaxKBAdapter
+
+    # Create adapter
+    adapter = MaxKBAdapter()
+
+    # Create a test file
+    import tempfile
+    with tempfile.NamedTemporaryFile(mode='w', suffix='.pdf', delete=False) as f:
+        f.write('test')
+        test_file = f.name
+
+    try:
+        # Test upload_file
+        print("Testing MaxKBAdapter.upload_file()...")
+        url = await adapter.upload_file(test_file, ['test_knowledge_id'])
+
+        print(f"\n✅ Generated URL: {url}")
+
+        # Verify URL format
+        if url.startswith('http://') or url.startswith('https://'):
+            print("✅ URL is properly formatted for Cloud API")
+        else:
+            print(f"❌ URL is not valid for Cloud API: {url}")
+
+        # Check if MAXKB_BASE_URL is used
+        base_url = os.environ.get('MAXKB_BASE_URL', '')
+        if base_url and url.startswith(base_url):
+            print(f"✅ URL correctly uses MAXKB_BASE_URL: {base_url}")
+        else:
+            print(f"❌ URL does not use MAXKB_BASE_URL")
+
+    finally:
+        # Clean up
+        if os.path.exists(test_file):
+            os.unlink(test_file)
+
+async def test_api_client_with_adapter():
+    """Test that MinerUAPIClient receives platform_adapter correctly"""
+
+    from apps.common.handle.impl.mineru.api_client import MinerUAPIClient
+    from apps.common.handle.impl.mineru.maxkb_adapter.adapter import MaxKBAdapter
+    from apps.common.handle.impl.mineru.maxkb_adapter.config_maxkb import MaxKBMinerUConfig
+
+    print("\nTesting MinerUAPIClient with platform_adapter...")
+
+    # Create components
+    adapter = MaxKBAdapter()
+    config = MaxKBMinerUConfig()
+
+    # Create API client with adapter
+    api_client = MinerUAPIClient(config, adapter)
+
+    # Check if adapter is set
+    if api_client.platform_adapter is not None:
+        print("✅ platform_adapter is correctly set in MinerUAPIClient")
+    else:
+        print("❌ platform_adapter is None in MinerUAPIClient")
+
+    # Test _upload_file_to_accessible_url
+    import tempfile
+    with tempfile.NamedTemporaryFile(mode='w', suffix='.pdf', delete=False) as f:
+        f.write('test')
+        test_file = f.name
+
+    try:
+        # Test upload through API client
+        async with api_client:
+            url = await api_client._upload_file_to_accessible_url(test_file, 'test_src_id')
+            print(f"✅ URL from _upload_file_to_accessible_url: {url}")
+
+            if url.startswith('http://') or url.startswith('https://'):
+                print("✅ API client generates valid URL for Cloud API")
+            else:
+                print(f"❌ API client generates invalid URL: {url}")
+
+    finally:
+        if os.path.exists(test_file):
+            os.unlink(test_file)
+
+if __name__ == "__main__":
+    print("=" * 60)
+    print("Testing MinerU Cloud API URL Fix")
+    print("=" * 60)
+
+    # Check environment
+    print("\nEnvironment:")
+    print(f"MAXKB_BASE_URL: {os.environ.get('MAXKB_BASE_URL', 'NOT SET')}")
+    print(f"MINERU_API_TYPE: {os.environ.get('MINERU_API_TYPE', 'NOT SET')}")
+
+    # Run tests
+    asyncio.run(test_url_generation())
+    asyncio.run(test_api_client_with_adapter())
+
+    print("\n" + "=" * 60)
+    print("Test completed!")
+    print("=" * 60)
\ No newline at end of file
diff --git a/test_url_simple.py b/test_url_simple.py
new file mode 100644
index 00000000..dfab4242
--- /dev/null
+++ b/test_url_simple.py
@@ -0,0 +1,94 @@
+#!/usr/bin/env python3
+"""
+Simple check of the URL generation logic
+"""
+
+import os
+import tempfile
+import shutil
+import uuid
+
+# Set the environment variable
+os.environ['MAXKB_BASE_URL'] = 'http://xbase.aitravelmaster.com'
+
+def test_url_generation():
+    """Simulate the upload_file logic from adapter.py"""
+
+    # Private scratch directory as the storage root, so cleanup never deletes a shared path
+    storage_path = tempfile.mkdtemp(prefix='maxkb_storage_')
+    file_path = None
+
+    try:
+        # Create the test file
+        with tempfile.NamedTemporaryFile(mode='w', suffix='.pdf', delete=False) as f:
+            f.write('test')
+            file_path = f.name
+
+        # Create the storage directory
+        sub_dir = 'mineru'
+        storage_dir = os.path.join(storage_path, sub_dir, 'images')
+        os.makedirs(storage_dir, exist_ok=True)
+
+        # Generate the file name
+        file_ext = os.path.splitext(file_path)[1]
+        file_name = f"{uuid.uuid4().hex}{file_ext}"
+        dest_path = os.path.join(storage_dir, file_name)
+
+        # Copy the file
+        shutil.copy2(file_path, dest_path)
+
+        # Generate the URL (this is the key part)
+        relative_path = os.path.relpath(dest_path, storage_path)
+        relative_path = relative_path.replace(os.path.sep, '/')
+
+        # Check the environment variable
+        base_url = os.getenv('MAXKB_BASE_URL', '')
+        print(f"MAXKB_BASE_URL from env: '{base_url}'")
+        print(f"Relative path: {relative_path}")
+
+        if base_url:
+            result_url = f"{base_url.rstrip('/')}/storage/{relative_path}"
+            print(f"✅ Generated full URL: {result_url}")
+        else:
+            result_url = f"/storage/{relative_path}"
+            print(f"⚠️ Generated relative URL: {result_url}")
+
+        # Validate the URL format
+        if result_url.startswith(('http://', 'https://')):
+            print("✅ URL is valid for Cloud API")
+        else:
+            print("❌ URL is NOT valid for Cloud API (must start with http:// or https://)")
+
+        return result_url
+
+    finally:
+        # Clean up the source file, if it was created
+        if file_path and os.path.exists(file_path):
+            os.unlink(file_path)
+        # Remove only the scratch directory created by this run
+        shutil.rmtree(storage_path, ignore_errors=True)
+
+if __name__ == "__main__":
+    print("=" * 60)
+    print("Testing URL Generation Logic")
+    print("=" * 60)
+    print()
+
+    # Test 1: with MAXKB_BASE_URL set
+    print("Test 1: With MAXKB_BASE_URL set")
+    print("-" * 40)
+    url1 = test_url_generation()
+
+    print("\n" + "=" * 60)
+
+    # Test 2: without MAXKB_BASE_URL
+    print("\nTest 2: Without MAXKB_BASE_URL")
+    print("-" * 40)
+    os.environ['MAXKB_BASE_URL'] = ''
+    url2 = test_url_generation()
+
+    print("\n" + "=" * 60)
+    print("Summary:")
+    print(f"With MAXKB_BASE_URL: {url1}")
+    print(f"Without MAXKB_BASE_URL: {url2}")
+    print("=" * 60)
\ No newline at end of file