#!/usr/bin/env python3
"""
MinerU Real File Processing Test

This script tests actual MinerU file processing with the MaxKB adapter.
It uses real files and real API calls (no mocking).
"""

import asyncio
import json
import os
import sys
import time
import traceback
from pathlib import Path
from typing import Optional, Dict, Any

# Add the project root (the directory containing this script) to the Python
# path so project packages can be imported by the tests below.
project_root = Path(__file__).parent
sys.path.insert(0, str(project_root))

# For MaxKB, also add the apps directory to the path
apps_path = project_root / 'apps'
if apps_path.exists():
    sys.path.insert(0, str(apps_path))
    print(f"✅ Added apps directory to Python path: {apps_path}")

# Load environment variables from .env file.
# python-dotenv is optional: without it only the process environment is used.
try:
    from dotenv import load_dotenv
except ImportError:
    print("ℹ️ python-dotenv not installed. Using system environment variables only.")
else:
    env_path = project_root / '.env'
    if env_path.exists():
        # override=True: values from .env replace variables that are already set.
        load_dotenv(env_path, override=True)
        print(f"✅ Loaded environment variables from {env_path}")

# Setup Django environment if we're in MaxKB.
# Failure here is not fatal: the script keeps running in standalone mode.
try:
    import django
    os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'maxkb.settings')
    django.setup()
    print("✅ Django environment initialized")
except ImportError:
    print("ℹ️ Django not available - running in standalone mode")
except Exception as e:
    print(f"ℹ️ Could not initialize Django: {e}")


class MinerURealTester:
    """Real MinerU file processing tester.

    Settings are read from environment variables when the instance is
    created: ``MINERU_TEST_FILE``, ``MINERU_API_URL`` (default
    ``http://mineru:8000``), ``MINERU_API_TYPE`` (default ``self_hosted``)
    and ``MINERU_API_KEY``.

    Both ``test_*`` coroutines return ``True`` on success and ``False`` on
    any failure; they log the reason instead of raising.
    """

    # Only this many sections are listed individually by test_mineru_adapter.
    _MAX_SECTIONS_SHOWN = 5

    def __init__(self):
        self.test_file = os.getenv('MINERU_TEST_FILE')
        self.api_url = os.getenv('MINERU_API_URL', 'http://mineru:8000')
        self.api_type = os.getenv('MINERU_API_TYPE', 'self_hosted')
        self.api_key = os.getenv('MINERU_API_KEY')

    def log(self, message: str, level: str = "INFO"):
        """Print *message* prefixed with the current time and a level icon.

        Unknown levels fall back to a generic icon.
        """
        icons = {"INFO": "ℹ️", "SUCCESS": "✅", "WARNING": "⚠️", "ERROR": "❌", "DEBUG": "🔍"}
        icon = icons.get(level, "📝")
        timestamp = time.strftime("%H:%M:%S")
        print(f"[{timestamp}] {icon} {message}")

    def _has_test_file(self) -> bool:
        """Return True when a test file is configured and exists on disk.

        Logs an error describing the problem and returns False otherwise.
        """
        if not self.test_file:
            self.log("No test file specified. Set MINERU_TEST_FILE environment variable.", "ERROR")
            return False
        if not os.path.exists(self.test_file):
            self.log(f"Test file not found: {self.test_file}", "ERROR")
            return False
        return True

    @staticmethod
    def _preview(content, limit: int) -> str:
        """Return the first *limit* characters of *content*, stripped.

        ``"..."`` is appended when the content was truncated; empty or
        ``None`` content yields ``"[EMPTY]"``.
        """
        if not content:
            return "[EMPTY]"
        preview = content[:limit].strip()
        if len(content) > limit:
            preview += "..."
        return preview

    @staticmethod
    def _section_totals(sections):
        """Return ``(content_chars, image_count)`` summed over ALL sections.

        Missing or ``None`` ``content`` / ``images`` entries count as empty.
        """
        total_content = sum(len(section.get('content') or '') for section in sections)
        total_images = sum(len(section.get('images') or []) for section in sections)
        return total_content, total_images

    def _report_parser_info(self, number: int, advanced) -> None:
        """Log the ``advanced_parser`` metadata of document *number*, if any.

        *advanced* may be a dict or a JSON string encoding one. Undecodable
        strings are reported as a warning; reporting is best-effort and
        never fails the test.
        """
        if not advanced:
            return
        if isinstance(advanced, str):
            try:
                advanced = json.loads(advanced)
            except ValueError as exc:  # json.JSONDecodeError is a ValueError
                self.log(f" - Document {number} parser info is not valid JSON: {exc}", "WARNING")
                return
        if not isinstance(advanced, dict):
            return
        self.log(f" - Document {number} parser info:", "DEBUG")
        self.log(f" - API Type: {advanced.get('api_type', 'unknown')}", "DEBUG")
        self.log(f" - Total Pages: {advanced.get('total_pages', 0)}", "DEBUG")
        self.log(f" - Successful Pages: {advanced.get('successful_pages', 0)}", "DEBUG")

    def _report_documents(self, documents):
        """Log a summary of a list result and return its totals.

        Each item is inspected for a ``page_content`` attribute and a dict
        ``metadata`` attribute; items lacking them are skipped for that part.

        Returns:
            ``(total_content_length, total_images)``.
        """
        self.log(f" - Document count: {len(documents)}", "INFO")

        total_content_length = 0
        total_images = 0

        for number, doc in enumerate(documents, start=1):
            if hasattr(doc, 'page_content'):
                # A None page_content is treated as empty instead of crashing.
                content = doc.page_content or ""
                total_content_length += len(content)
                self.log(f" - Document {number} preview: {self._preview(content, 200)}", "DEBUG")

            metadata = getattr(doc, 'metadata', None)
            if not isinstance(metadata, dict):
                continue

            images = metadata.get('images', [])
            if images:
                total_images += len(images)
                self.log(f" - Document {number} has {len(images)} images", "INFO")

            self._report_parser_info(number, metadata.get('advanced_parser'))

        self.log(f" - Total content length: {total_content_length} characters", "INFO")
        self.log(f" - Total images found: {total_images}", "INFO")
        return total_content_length, total_images

    def _report_dict_result(self, result) -> None:
        """Log a summary of a dict result (``content`` and ``images`` keys)."""
        self.log(f" - Result type: dictionary", "INFO")

        if 'content' in result:
            content = result['content']
            if isinstance(content, str):
                self.log(f" - Content length: {len(content)} characters", "INFO")
                self.log(f" - Content preview: {self._preview(content, 200)}", "DEBUG")
            elif isinstance(content, list):
                self.log(f" - Content items: {len(content)}", "INFO")

        if 'images' in result:
            images = result['images']
            self.log(f" - Images found: {len(images)}", "INFO")
            for img_path in images[:5]:  # Show first 5 images
                self.log(f" - {img_path}", "DEBUG")

    async def test_real_file_processing(self):
        """Test real file processing with MinerU via ``MinerUExtractor``.

        Builds a ``MaxKBMinerUConfig`` from this tester's settings (plus the
        ``ENABLE_CACHE``, ``MINERU_LLM_MODEL_ID`` and
        ``MINERU_VISION_MODEL_ID`` environment variables), awaits
        ``extractor.process_file`` on the test file and logs a summary.

        Returns:
            True when processing returned a truthy result; False when the
            test file is missing, the result is empty, or anything raised.
        """
        if not self._has_test_file():
            return False

        self.log(f"Testing with file: {self.test_file}", "INFO")
        self.log(f"MinerU API: {self.api_url} ({self.api_type})", "INFO")

        try:
            # Imported here so that a missing MaxKB installation is reported
            # as a failed test instead of breaking the import of this script.
            from common.handle.impl.mineru.maxkb_adapter.adapter import MinerUExtractor
            from common.handle.impl.mineru.maxkb_adapter.config_maxkb import MaxKBMinerUConfig

            # Create configuration
            config = MaxKBMinerUConfig()
            config.mineru_api_type = self.api_type
            config.mineru_api_url = self.api_url
            config.mineru_api_key = self.api_key
            config.enable_cache = os.getenv('ENABLE_CACHE', 'true').lower() == 'true'

            self.log("Configuration created", "SUCCESS")
            self.log(f" - API Type: {config.mineru_api_type}", "DEBUG")
            self.log(f" - API URL: {config.mineru_api_url}", "DEBUG")
            self.log(f" - Cache Enabled: {config.enable_cache}", "DEBUG")

            # Create extractor
            llm_model_id = os.getenv('MINERU_LLM_MODEL_ID')
            vision_model_id = os.getenv('MINERU_VISION_MODEL_ID')

            extractor = MinerUExtractor(
                llm_model_id=llm_model_id,
                vision_model_id=vision_model_id,
            )
            extractor.config = config

            self.log("MinerU extractor created", "SUCCESS")
            if llm_model_id:
                self.log(f" - LLM Model: {llm_model_id}", "DEBUG")
            if vision_model_id:
                self.log(f" - Vision Model: {vision_model_id}", "DEBUG")

            # Process the file
            self.log("Starting file processing...", "INFO")
            started = time.perf_counter()  # monotonic: safe for durations

            result = await extractor.process_file(
                filepath=self.test_file,
                src_name=os.path.basename(self.test_file),
                # Original note: "Will use test mode for uploads" - behaviour
                # of None is defined by the adapter; TODO confirm.
                upload_options=None,
            )

            processing_time = time.perf_counter() - started
            self.log(f"Processing completed in {processing_time:.2f} seconds", "SUCCESS")

            # Analyze results
            if not result:
                self.log("No result returned from processing", "ERROR")
                return False

            self.log("Processing results:", "INFO")
            if isinstance(result, list):
                self._report_documents(result)
            elif isinstance(result, dict):
                self._report_dict_result(result)
            else:
                self.log(f" - Result type: {type(result).__name__}", "INFO")

            self.log("✅ File processing test PASSED", "SUCCESS")
            return True

        except Exception as e:
            # Top-level boundary of the test: report and fail, never raise.
            self.log(f"File processing failed: {str(e)}", "ERROR")
            self.log(f"Traceback:\n{traceback.format_exc()}", "DEBUG")
            return False

    async def test_mineru_adapter(self):
        """Test ``MinerUAdapter`` for document processing.

        Reads the test file into memory, passes the bytes to
        ``adapter.process_document`` (called without ``await``) and logs the
        first few sections plus totals computed over all sections.

        Returns:
            True when the result contains a ``sections`` key; False when the
            test file is missing, no sections came back, or anything raised.
        """
        if not self._has_test_file():
            return False

        self.log(f"Testing MinerUAdapter with file: {self.test_file}", "INFO")

        try:
            from common.handle.impl.mineru.maxkb_adapter.adapter import MinerUAdapter

            # Create adapter
            adapter = MinerUAdapter()
            self.log("MinerUAdapter created", "SUCCESS")

            # Read file content
            with open(self.test_file, 'rb') as f:
                file_content = f.read()

            self.log(f"File size: {len(file_content)} bytes", "INFO")

            # Process document
            self.log("Processing document...", "INFO")
            started = time.perf_counter()

            result = adapter.process_document(
                file_content=file_content,
                file_name=os.path.basename(self.test_file),
                save_image_func=None,  # Test mode - no image saving
            )

            processing_time = time.perf_counter() - started
            self.log(f"Processing completed in {processing_time:.2f} seconds", "SUCCESS")

            # Analyze results
            if not (result and 'sections' in result):
                self.log("No sections returned from processing", "ERROR")
                return False

            sections = result['sections']
            self.log(f"Sections extracted: {len(sections)}", "INFO")

            # Only the first few sections are listed in detail.
            shown = sections[:self._MAX_SECTIONS_SHOWN]
            for number, section in enumerate(shown, start=1):
                content = section.get('content') or ''
                title = section.get('title', '')
                images = section.get('images') or []

                self.log(f" Section {number}:", "INFO")
                if title:
                    self.log(f" - Title: {title}", "DEBUG")
                self.log(f" - Content length: {len(content)} chars", "DEBUG")
                if images:
                    self.log(f" - Images: {len(images)}", "DEBUG")

                # Show content preview
                if content:
                    self.log(f" - Preview: {self._preview(content, 100)}", "DEBUG")

            # Totals cover every section, not just the ones listed above.
            total_content, total_images = self._section_totals(sections)
            self.log(f"Total content extracted: {total_content} characters", "INFO")
            self.log(f"Total images found: {total_images}", "INFO")

            self.log("✅ MinerUAdapter test PASSED", "SUCCESS")
            return True

        except Exception as e:
            # Top-level boundary of the test: report and fail, never raise.
            self.log(f"MinerUAdapter test failed: {str(e)}", "ERROR")
            self.log(f"Traceback:\n{traceback.format_exc()}", "DEBUG")
            return False


async def main():
    """Main test function.

    Runs both ``MinerURealTester`` tests in sequence and prints a summary.

    Returns:
        Process exit code: 0 when every test passed, 1 when
        ``MINERU_TEST_FILE`` is not set or at least one test failed.
    """
    rule = "=" * 60

    def banner(title):
        # Section header: blank line, rule, title, rule.
        print("\n" + rule)
        print(title)
        print(rule)

    banner("🚀 MinerU Real File Processing Test")

    # Check if test file is specified
    test_file = os.getenv('MINERU_TEST_FILE')
    if not test_file:
        print("\n❌ Error: MINERU_TEST_FILE environment variable not set")
        print("\nUsage:")
        print(" export MINERU_TEST_FILE=/path/to/your/test.pdf")
        print(" python test_mineru_real.py")
        print("\nOr:")
        print(" MINERU_TEST_FILE=/path/to/your/test.pdf python test_mineru_real.py")
        return 1

    # Create tester
    tester = MinerURealTester()

    # Run tests
    banner("Test 1: MinerU Extractor (Direct Processing)")
    test1_passed = await tester.test_real_file_processing()

    banner("Test 2: MinerU Adapter (MaxKB Integration)")
    test2_passed = await tester.test_mineru_adapter()

    # Summary
    banner("📊 Test Summary")

    tests = [
        ("MinerU Extractor", test1_passed),
        ("MinerU Adapter", test2_passed),
    ]

    for test_name, passed in tests:
        status = "✅ PASS" if passed else "❌ FAIL"
        print(f"{status} {test_name}")

    passed_count = sum(1 for _, passed in tests if passed)
    total_count = len(tests)

    print(f"\nTotal: {passed_count}/{total_count} tests passed")

    if passed_count == total_count:
        print("\n🎉 All tests passed successfully!")
        return 0

    print(f"\n⚠️ {total_count - passed_count} test(s) failed")
    return 1


if __name__ == "__main__":
    # Script entry point: run the async test driver and use its return value
    # as the process exit status. Any interruption or unexpected error exits
    # with status 1.
    try:
        exit_code = asyncio.run(main())
        sys.exit(exit_code)
    except KeyboardInterrupt:
        print("\n🛑 Test interrupted by user")
        sys.exit(1)
    except Exception as e:
        # SystemExit is not an Exception subclass, so sys.exit() above
        # passes through this handler untouched.
        print(f"❌ Test script error: {e}")
        print(traceback.format_exc())
        sys.exit(1)