# maxkb/test_mineru_real.py

#!/usr/bin/env python3
"""
MinerU Real File Processing Test

This script tests actual MinerU file processing with the MaxKB adapter.
It uses real files and real API calls (no mocking).
"""

import asyncio
import os
import sys
import json
import time
from pathlib import Path
from typing import Optional, Dict, Any

# --- Import-time bootstrap -------------------------------------------------
# Everything below runs as a side effect of importing/executing this module.
# The order matters: sys.path must be extended and the environment loaded
# before Django is configured.

# Put the directory containing this script at the front of sys.path.
project_root = Path(__file__).parent
sys.path.insert(0, str(project_root))

# For MaxKB, also add the apps directory to the path.
# It is inserted at index 0 as well, so when it exists it ends up ahead of
# project_root in the import search order.
apps_path = project_root / 'apps'
if apps_path.exists():
    sys.path.insert(0, str(apps_path))
    print(f"✅ Added apps directory to Python path: {apps_path}")

# Load environment variables from a .env file next to this script.
# python-dotenv is optional; without it only the process environment is used.
try:
    from dotenv import load_dotenv
    env_path = project_root / '.env'
    if env_path.exists():
        # override=True is passed so values from .env take precedence over
        # variables that are already set in the process environment.
        load_dotenv(env_path, override=True)
        print(f"✅ Loaded environment variables from {env_path}")
except ImportError:
    print("ℹ️ python-dotenv not installed. Using system environment variables only.")

# Set up the Django environment if we're in MaxKB.
# Failure here is not fatal: the script keeps going and only prints a notice.
try:
    import django
    # setdefault keeps a DJANGO_SETTINGS_MODULE that is already defined.
    os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'maxkb.settings')
    django.setup()
    print("✅ Django environment initialized")
except ImportError:
    print("ℹ️ Django not available - running in standalone mode")
except Exception as e:
    # Best-effort: any other setup error is reported and execution continues.
    print(f"ℹ️ Could not initialize Django: {e}")


class MinerURealTester:
    """Run the MaxKB MinerU integration against a real file (no mocking).

    Settings are read from environment variables when the tester is created:

    * ``MINERU_TEST_FILE`` - path of the file to process (required by the tests)
    * ``MINERU_API_URL``   - defaults to ``http://mineru:8000``
    * ``MINERU_API_TYPE``  - defaults to ``self_hosted``
    * ``MINERU_API_KEY``   - optional

    Each public ``test_*`` coroutine returns ``True`` on success and ``False``
    on any failure; failures are logged rather than raised.
    """

    # Icon printed in front of a log line, keyed by level name.
    _LOG_ICONS = {"INFO": "ℹ️", "SUCCESS": "✅", "WARNING": "⚠️", "ERROR": "❌", "DEBUG": "🔍"}

    # How many sections / image paths are listed individually in the reports.
    _DETAIL_LIMIT = 5

    def __init__(self):
        self.test_file = os.getenv('MINERU_TEST_FILE')
        self.api_url = os.getenv('MINERU_API_URL', 'http://mineru:8000')
        self.api_type = os.getenv('MINERU_API_TYPE', 'self_hosted')
        self.api_key = os.getenv('MINERU_API_KEY')

    def log(self, message: str, level: str = "INFO"):
        """Print *message* prefixed with a timestamp and an icon for *level*.

        Unknown levels fall back to a generic icon.
        """
        icon = self._LOG_ICONS.get(level, "📝")
        timestamp = time.strftime("%H:%M:%S")
        print(f"[{timestamp}] {icon} {message}")

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _preview(text, limit: int) -> str:
        """Return the first *limit* characters of *text*, stripped.

        ``"..."`` is appended when *text* is longer than *limit*; an empty or
        ``None`` *text* yields ``"[EMPTY]"``.
        """
        if not text:
            return "[EMPTY]"
        snippet = text[:limit].strip()
        if len(text) > limit:
            snippet += "..."
        return snippet

    @staticmethod
    def _section_totals(sections) -> tuple:
        """Return ``(total content characters, total images)`` over ALL sections.

        Missing or ``None`` ``content`` / ``images`` entries count as empty.
        """
        total_chars = sum(len(section.get('content') or '') for section in sections)
        total_images = sum(len(section.get('images') or []) for section in sections)
        return total_chars, total_images

    def _test_file_ready(self) -> bool:
        """Check that a test file is configured and exists, logging any problem."""
        if not self.test_file:
            self.log("No test file specified. Set MINERU_TEST_FILE environment variable.", "ERROR")
            return False
        if not os.path.exists(self.test_file):
            self.log(f"Test file not found: {self.test_file}", "ERROR")
            return False
        return True

    def _log_failure(self, prefix: str, error: Exception):
        """Log *error* under *prefix* together with the current traceback."""
        import traceback
        self.log(f"{prefix}: {str(error)}", "ERROR")
        self.log(f"Traceback:\n{traceback.format_exc()}", "DEBUG")

    def _report_metadata(self, index: int, metadata: dict) -> int:
        """Log image and parser details of one document; return its image count."""
        images = metadata.get('images') or []
        if images:
            self.log(f"  - Document {index} has {len(images)} images", "INFO")

        advanced = metadata.get('advanced_parser')
        if advanced:
            try:
                # The value may be a JSON string or an already decoded object.
                advanced_info = json.loads(advanced) if isinstance(advanced, str) else advanced
            except ValueError as exc:
                # json.JSONDecodeError is a ValueError. Malformed parser info
                # must not fail the run, but it is reported instead of being
                # silently dropped.
                self.log(f"  - Document {index} has unparseable advanced_parser metadata: {exc}", "WARNING")
            else:
                if isinstance(advanced_info, dict):
                    self.log(f"  - Document {index} parser info:", "DEBUG")
                    self.log(f"    - API Type: {advanced_info.get('api_type', 'unknown')}", "DEBUG")
                    self.log(f"    - Total Pages: {advanced_info.get('total_pages', 0)}", "DEBUG")
                    self.log(f"    - Successful Pages: {advanced_info.get('successful_pages', 0)}", "DEBUG")
        return len(images)

    def _report_documents(self, documents) -> tuple:
        """Log a summary of a list result; return ``(total chars, total images)``.

        Only objects exposing ``page_content`` / a dict ``metadata`` contribute.
        """
        self.log(f"  - Document count: {len(documents)}", "INFO")

        total_chars = 0
        total_images = 0
        for index, doc in enumerate(documents, start=1):
            if hasattr(doc, 'page_content'):
                # A None page_content is treated as empty instead of crashing len().
                content = doc.page_content or ""
                total_chars += len(content)
                self.log(f"  - Document {index} preview: {self._preview(content, 200)}", "DEBUG")

            metadata = getattr(doc, 'metadata', None)
            if isinstance(metadata, dict):
                total_images += self._report_metadata(index, metadata)

        self.log(f"  - Total content length: {total_chars} characters", "INFO")
        self.log(f"  - Total images found: {total_images}", "INFO")
        return total_chars, total_images

    def _report_dict_result(self, result: dict):
        """Log a summary of a dictionary result (``content`` and ``images`` keys)."""
        self.log("  - Result type: dictionary", "INFO")

        if 'content' in result:
            content = result['content']
            if isinstance(content, str):
                self.log(f"  - Content length: {len(content)} characters", "INFO")
                self.log(f"  - Content preview: {self._preview(content, 200)}", "DEBUG")
            elif isinstance(content, list):
                self.log(f"  - Content items: {len(content)}", "INFO")

        if 'images' in result:
            images = result['images'] or []
            self.log(f"  - Images found: {len(images)}", "INFO")
            for img_path in images[:self._DETAIL_LIMIT]:  # Show only the first few
                self.log(f"    - {img_path}", "DEBUG")

    def _report_sections(self, sections) -> tuple:
        """Log per-section details and overall totals; return the totals.

        Details are printed for the first ``_DETAIL_LIMIT`` sections only, but
        the totals are computed over every section.
        """
        self.log(f"Sections extracted: {len(sections)}", "INFO")

        for index, section in enumerate(sections[:self._DETAIL_LIMIT], start=1):
            content = section.get('content') or ''
            title = section.get('title', '')
            images = section.get('images') or []

            self.log(f"  Section {index}:", "INFO")
            if title:
                self.log(f"    - Title: {title}", "DEBUG")
            self.log(f"    - Content length: {len(content)} chars", "DEBUG")
            if images:
                self.log(f"    - Images: {len(images)}", "DEBUG")
            if content:
                self.log(f"    - Preview: {self._preview(content, 100)}", "DEBUG")

        hidden = len(sections) - self._DETAIL_LIMIT
        if hidden > 0:
            self.log(f"  ... {hidden} more section(s) not shown", "DEBUG")

        total_chars, total_images = self._section_totals(sections)
        self.log(f"Total content extracted: {total_chars} characters", "INFO")
        self.log(f"Total images found: {total_images}", "INFO")
        return total_chars, total_images

    # ------------------------------------------------------------------
    # Tests
    # ------------------------------------------------------------------

    async def test_real_file_processing(self):
        """Process the test file with ``MinerUExtractor`` and summarise the result.

        Returns:
            bool: ``True`` when processing returned a truthy result, ``False``
            when the test file is missing, nothing was returned, or any
            exception (including a failed adapter import) occurred.
        """
        if not self._test_file_ready():
            return False

        self.log(f"Testing with file: {self.test_file}", "INFO")
        self.log(f"MinerU API: {self.api_url} ({self.api_type})", "INFO")

        try:
            # Imported lazily so a missing MaxKB installation is reported as a
            # failed test instead of breaking module import.
            from common.handle.impl.mineru.maxkb_adapter.adapter import MinerUExtractor
            from common.handle.impl.mineru.maxkb_adapter.config_maxkb import MaxKBMinerUConfig

            # Create configuration
            config = MaxKBMinerUConfig()
            config.mineru_api_type = self.api_type
            config.mineru_api_url = self.api_url
            config.mineru_api_key = self.api_key
            config.enable_cache = os.getenv('ENABLE_CACHE', 'true').lower() == 'true'

            self.log("Configuration created", "SUCCESS")
            self.log(f"  - API Type: {config.mineru_api_type}", "DEBUG")
            self.log(f"  - API URL: {config.mineru_api_url}", "DEBUG")
            self.log(f"  - Cache Enabled: {config.enable_cache}", "DEBUG")

            # Create extractor; the model IDs are optional.
            llm_model_id = os.getenv('MINERU_LLM_MODEL_ID')
            vision_model_id = os.getenv('MINERU_VISION_MODEL_ID')

            extractor = MinerUExtractor(
                llm_model_id=llm_model_id,
                vision_model_id=vision_model_id
            )
            # The constructor is only given the model IDs, so the explicit
            # configuration is attached afterwards.
            extractor.config = config

            self.log("MinerU extractor created", "SUCCESS")
            if llm_model_id:
                self.log(f"  - LLM Model: {llm_model_id}", "DEBUG")
            if vision_model_id:
                self.log(f"  - Vision Model: {vision_model_id}", "DEBUG")

            # Process the file
            self.log("Starting file processing...", "INFO")
            started = time.perf_counter()  # monotonic clock for durations

            result = await extractor.process_file(
                filepath=self.test_file,
                src_name=os.path.basename(self.test_file),
                # NOTE(review): original author's note says None selects a
                # "test mode" for uploads - confirm against the adapter.
                upload_options=None
            )

            processing_time = time.perf_counter() - started
            self.log(f"Processing completed in {processing_time:.2f} seconds", "SUCCESS")

            if not result:
                self.log("No result returned from processing", "ERROR")
                return False

            # Analyze results
            self.log("Processing results:", "INFO")
            if isinstance(result, list):
                self._report_documents(result)
            elif isinstance(result, dict):
                self._report_dict_result(result)
            else:
                self.log(f"  - Result type: {type(result).__name__}", "INFO")

            self.log("✅ File processing test PASSED", "SUCCESS")
            return True

        except Exception as e:
            # Top-level boundary of the test: report and turn into a failure.
            self._log_failure("File processing failed", e)
            return False

    async def test_mineru_adapter(self):
        """Process the test file through ``MinerUAdapter.process_document``.

        Returns:
            bool: ``True`` when the result contains a ``sections`` entry,
            ``False`` when the test file is missing, no sections were
            returned, or any exception occurred.
        """
        if not self._test_file_ready():
            return False

        self.log(f"Testing MinerUAdapter with file: {self.test_file}", "INFO")

        try:
            from common.handle.impl.mineru.maxkb_adapter.adapter import MinerUAdapter

            # Create adapter
            adapter = MinerUAdapter()
            self.log("MinerUAdapter created", "SUCCESS")

            # Read file content
            with open(self.test_file, 'rb') as f:
                file_content = f.read()

            self.log(f"File size: {len(file_content)} bytes", "INFO")

            # Process document
            self.log("Processing document...", "INFO")
            started = time.perf_counter()

            result = adapter.process_document(
                file_content=file_content,
                file_name=os.path.basename(self.test_file),
                save_image_func=None  # Test mode - no image saving
            )

            processing_time = time.perf_counter() - started
            self.log(f"Processing completed in {processing_time:.2f} seconds", "SUCCESS")

            # Analyze results
            if not result or 'sections' not in result:
                self.log("No sections returned from processing", "ERROR")
                return False

            self._report_sections(result['sections'])

            self.log("✅ MinerUAdapter test PASSED", "SUCCESS")
            return True

        except Exception as e:
            # Top-level boundary of the test: report and turn into a failure.
            self._log_failure("MinerUAdapter test failed", e)
            return False


async def main():
    """Run both MinerU checks in sequence and print a summary.

    Returns:
        int: ``0`` when every check passed; ``1`` when at least one failed or
        when ``MINERU_TEST_FILE`` is not set.
    """
    rule = "=" * 60

    print("\n" + rule)
    print("🚀 MinerU Real File Processing Test")
    print(rule)

    # Bail out early with usage help when no input file is configured.
    if not os.getenv('MINERU_TEST_FILE'):
        usage_lines = (
            "\n❌ Error: MINERU_TEST_FILE environment variable not set",
            "\nUsage:",
            "  export MINERU_TEST_FILE=/path/to/your/test.pdf",
            "  python test_mineru_real.py",
            "\nOr:",
            "  MINERU_TEST_FILE=/path/to/your/test.pdf python test_mineru_real.py",
        )
        for line in usage_lines:
            print(line)
        return 1

    tester = MinerURealTester()

    # (section heading, summary label, coroutine function) - run in this order.
    plan = (
        ("Test 1: MinerU Extractor (Direct Processing)", "MinerU Extractor",
         tester.test_real_file_processing),
        ("Test 2: MinerU Adapter (MaxKB Integration)", "MinerU Adapter",
         tester.test_mineru_adapter),
    )

    outcomes = []
    for heading, label, check in plan:
        print("\n" + rule)
        print(heading)
        print(rule)
        outcomes.append((label, await check()))

    # Summary
    print("\n" + rule)
    print("📊 Test Summary")
    print(rule)

    for label, ok in outcomes:
        if ok:
            status = "✅ PASS"
        else:
            status = "❌ FAIL"
        print(f"{status} {label}")

    total_count = len(outcomes)
    passed_count = len([label for label, ok in outcomes if ok])

    print(f"\nTotal: {passed_count}/{total_count} tests passed")

    failed_count = total_count - passed_count
    if failed_count != 0:
        print(f"\n⚠️ {failed_count} test(s) failed")
        return 1

    print("\n🎉 All tests passed successfully!")
    return 0


if __name__ == "__main__":
    # Script entry point: the process exit status is whatever main() returns;
    # an interrupt or an unexpected error exits with status 1.
    try:
        sys.exit(asyncio.run(main()))
    except KeyboardInterrupt:
        print("\n🛑 Test interrupted by user")
        sys.exit(1)
    except Exception as exc:
        import traceback
        print(f"❌ Test script error: {exc}")
        print(traceback.format_exc())
        sys.exit(1)