# maxkb/apps/common/handle/impl/mineru/converter.py

"""
File conversion module for MinerU-based parsing.

This module handles file type detection, conversion of PPT, DOC and image
files to PDF, and related file processing operations.
"""

import os
import asyncio
import subprocess
from pathlib import Path
from typing import Tuple, Optional
from .logger import get_module_logger
logger = get_module_logger('converter')

from .utils import FileConverter, PDFDetector
from .config_base import MinerUConfig


class DocumentConverter:
    """Document conversion handler with PPT support"""

    def __init__(self, config: MinerUConfig):
        """
        Bind the converter to its configuration and the module-level logger.

        Args:
            config: MinerU settings object; stored unchanged on the instance
        """
        self.logger = logger
        self.config = config

    async def handle_file_input(self, file_path: str, temp_dir: str, src_fileid: str) -> Tuple[str, bool]:
        """
        Handle input file processing - detect type and convert if needed.

        Args:
            file_path: Input file path
            temp_dir: Temporary directory for processing
            src_fileid: Source file ID for logging

        Returns:
            Tuple of (pdf_path, is_ppt_source)
        """
        try:
            file_type = FileConverter.detect_file_type(file_path)
            self.logger.info(f"mineru-converter:  detected file type: {file_type}")

            # Validate file size
            if not FileConverter.validate_file_size(file_path, self.config.max_file_size / (1024 * 1024)):
                raise ValueError(f"File size exceeds maximum allowed size of {self.config.max_file_size / (1024 * 1024):.1f}MB")

            if file_type in ['ppt', 'pptx']:
                # PPT file needs conversion
                self.logger.info(f"mineru-converter:  converting PPT to PDF")
                pdf_path = await self._convert_ppt_to_pdf(file_path, temp_dir, src_fileid)
                return pdf_path, True  # is_ppt_source = True

            elif file_type in ['doc', 'docx']:
                # DOC file needs conversion (treat similar to PPT)
                self.logger.info(f"mineru-converter:  converting DOC to PDF")
                pdf_path = await self._convert_doc_to_pdf(file_path, temp_dir, src_fileid)
                return pdf_path, True  # is_ppt_source = True (DOC treated like PPT)

            elif file_type == 'image':
                # Image file needs conversion
                self.logger.info(f"mineru-converter:  converting image to PDF")
                pdf_path = await self._convert_image_to_pdf(file_path, temp_dir, src_fileid)
                return pdf_path, False  # is_ppt_source = False (images are not PPT-like)

            elif file_type == 'pdf':
                # PDF file - copy to temp directory for processing
                self.logger.info(f"mineru-converter:  processing PDF directly")
                pdf_path = os.path.join(temp_dir, f"source.pdf")

                # Copy file to temp directory
                import shutil
                shutil.copy(file_path, pdf_path)

                return pdf_path, False  # is_ppt_source = False

            else:
                raise ValueError(f"Unsupported file type: {file_type}")

        except Exception as e:
            self.logger.error(f"mineru-converter:  file input handling failed: {str(e)}")
            raise

    async def _convert_ppt_to_pdf(self, ppt_path: str, temp_dir: str, src_fileid: str) -> str:
        """
        Convert PPT file to PDF using LibreOffice or alternative method.

        Args:
            ppt_path: Path to PPT file
            temp_dir: Temporary directory
            src_fileid: Source file ID for logging

        Returns:
            Path to converted PDF file
        """
        try:
            pdf_path = os.path.join(temp_dir, "source.pdf")

            # Try LibreOffice conversion first
            success = await self._try_libreoffice_conversion(ppt_path, temp_dir, src_fileid)

            if success:
                # LibreOffice generates file with original name
                generated_pdf = os.path.join(temp_dir, Path(ppt_path).stem + ".pdf")
                if os.path.exists(generated_pdf):
                    if generated_pdf != pdf_path:
                        os.rename(generated_pdf, pdf_path)
                elif os.path.exists(pdf_path):
                    pass  # File already exists with correct name
                else:
                    raise Exception("PDF file was not generated by LibreOffice")
            else:
                # Fallback to alternative method
                self.logger.warning(f"mineru-converter:  LibreOffice failed, trying alternative conversion")
                pdf_path = await self._try_alternative_conversion(ppt_path, temp_dir, src_fileid)

            if not os.path.exists(pdf_path):
                raise Exception("PDF conversion failed - no output file generated")

            self.logger.info(f"mineru-converter:  PPT to PDF conversion successful: {pdf_path}")
            return pdf_path

        except Exception as e:
            self.logger.error(f"mineru-converter:  PPT conversion error: {str(e)}")
            raise Exception(f"PPT to PDF conversion failed: {str(e)}")

    async def _try_libreoffice_conversion(self, ppt_path: str, temp_dir: str, src_fileid: str) -> bool:
        """
        Try converting PPT using LibreOffice.

        Returns:
            True if conversion successful, False otherwise
        """
        try:
            cmd = [
                self.config.libreoffice_path,
                '--headless',
                '--convert-to', 'pdf',
                '--outdir', temp_dir,
                ppt_path
            ]

            self.logger.info(f"mineru-converter:  executing LibreOffice conversion")

            # Run with timeout
            process = await asyncio.create_subprocess_exec(
                *cmd,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE
            )

            try:
                stdout, stderr = await asyncio.wait_for(
                    process.communicate(),
                    timeout=self.config.conversion_timeout
                )

                if process.returncode != 0:
                    self.logger.warning(f"mineru-converter:  LibreOffice returned non-zero: {stderr.decode()}")
                    return False

                self.logger.info(f"mineru-converter:  LibreOffice conversion completed successfully")
                return True

            except asyncio.TimeoutError:
                self.logger.warning(f"mineru-converter:  LibreOffice conversion timeout")
                process.kill()
                return False

        except FileNotFoundError:
            self.logger.warning(f"mineru-converter:  LibreOffice not found at: {self.config.libreoffice_path}")
            return False
        except Exception as e:
            self.logger.warning(f"mineru-converter:  LibreOffice conversion error: {str(e)}")
            return False

    async def _try_alternative_conversion(self, ppt_path: str, temp_dir: str, src_fileid: str) -> str:
        """
        Try alternative PPT conversion method using python libraries.

        Returns:
            Path to converted PDF file
        """
        try:
            from pptx import Presentation
            from reportlab.pdfgen import canvas
            from reportlab.lib.pagesizes import A4

            pdf_path = os.path.join(temp_dir, "source.pdf")

            self.logger.info(f"mineru-converter:  using python-pptx + reportlab conversion")

            # Load PPT presentation
            prs = Presentation(ppt_path)

            # Create PDF
            c = canvas.Canvas(pdf_path, pagesize=A4)
            page_width, page_height = A4

            for slide_idx, slide in enumerate(prs.slides):
                if slide_idx > 0:
                    c.showPage()

                # Add slide content
                y_position = page_height - 50  # Start from top
                margin = 50

                # Extract text content from slide
                slide_texts = []
                for shape in slide.shapes:
                    if hasattr(shape, "text") and shape.text.strip():
                        slide_texts.append(shape.text.strip())

                # Write text to PDF
                for text in slide_texts:
                    if y_position < margin:
                        break  # Avoid writing below page margin

                    # Split long text into lines
                    words = text.split()
                    line = ""
                    for word in words:
                        test_line = f"{line} {word}".strip()
                        if len(test_line) > 80:  # Approximate character limit per line
                            if line:
                                c.drawString(margin, y_position, line)
                                y_position -= 15
                                if y_position < margin:
                                    break
                            line = word
                        else:
                            line = test_line

                    if line and y_position >= margin:
                        c.drawString(margin, y_position, line)
                        y_position -= 20  # Extra space between text blocks

                # Add slide number
                c.drawString(page_width - 100, 30, f"Slide {slide_idx + 1}")

            c.save()

            self.logger.info(f"mineru-converter:  alternative conversion completed: {pdf_path}")
            return pdf_path

        except ImportError as e:
            raise Exception(f"Alternative conversion requires python-pptx and reportlab: {str(e)}")
        except Exception as e:
            raise Exception(f"Alternative PPT conversion failed: {str(e)}")

    async def _convert_doc_to_pdf(self, doc_path: str, temp_dir: str, src_fileid: str) -> str:
        """
        Convert DOC file to PDF using LibreOffice or alternative method.

        Args:
            doc_path: Path to DOC file
            temp_dir: Temporary directory
            src_fileid: Source file ID for logging

        Returns:
            Path to converted PDF file
        """
        try:
            pdf_path = os.path.join(temp_dir, "source.pdf")

            # Try LibreOffice conversion first
            success = await self._try_libreoffice_conversion(doc_path, temp_dir, src_fileid)

            if success:
                # LibreOffice generates file with original name
                generated_pdf = os.path.join(temp_dir, Path(doc_path).stem + ".pdf")
                if os.path.exists(generated_pdf):
                    if generated_pdf != pdf_path:
                        os.rename(generated_pdf, pdf_path)
                elif os.path.exists(pdf_path):
                    pass  # File already exists with correct name
                else:
                    raise Exception("PDF file was not generated by LibreOffice")
            else:
                # Fallback to alternative method
                self.logger.warning(f"mineru-converter:  LibreOffice failed, using FileConverter for DOC")
                pdf_path = await FileConverter.convert_doc_to_pdf(doc_path, temp_dir)

            if not os.path.exists(pdf_path):
                raise Exception("PDF conversion failed - no output file generated")

            self.logger.info(f"mineru-converter:  DOC to PDF conversion successful: {pdf_path}")
            return pdf_path

        except Exception as e:
            self.logger.error(f"mineru-converter:  DOC conversion error: {str(e)}")
            raise Exception(f"DOC to PDF conversion failed: {str(e)}")

    async def _convert_image_to_pdf(self, image_path: str, temp_dir: str, src_fileid: str) -> str:
        """
        Convert image file to PDF.

        Args:
            image_path: Path to image file
            temp_dir: Temporary directory
            src_fileid: Source file ID for logging

        Returns:
            Path to converted PDF file
        """
        try:
            pdf_path = os.path.join(temp_dir, "source.pdf")

            # Use FileConverter utility
            converted_path = await FileConverter.convert_image_to_pdf(image_path, temp_dir)

            # Rename to standard name if needed
            if converted_path != pdf_path and os.path.exists(converted_path):
                os.rename(converted_path, pdf_path)

            if not os.path.exists(pdf_path):
                raise Exception("PDF conversion failed - no output file generated")

            self.logger.info(f"mineru-converter:  image to PDF conversion successful: {pdf_path}")
            return pdf_path

        except Exception as e:
            self.logger.error(f"mineru-converter:  image conversion error: {str(e)}")
            raise Exception(f"Image to PDF conversion failed: {str(e)}")

    def detect_pdf_format(self, pdf_path: str, src_fileid: str) -> Tuple[bool, dict]:
        """
        Detect if PDF was converted from PowerPoint.

        Args:
            pdf_path: Path to PDF file
            src_fileid: Source file ID for logging

        Returns:
            Tuple of (is_ppt_format, metadata)
        """
        try:
            is_ppt, metadata = PDFDetector.is_pdf_from_ppt(pdf_path)

            self.logger.info(f"mineru-converter:  PDF format detection: is_ppt={is_ppt}, metadata={metadata}")

            return is_ppt, metadata

        except Exception as e:
            self.logger.error(f"mineru-converter:  PDF format detection failed: {str(e)}")
            return False, {}

    def extract_pdf_pages(self, pdf_path: str, src_fileid: str) -> list:
        """
        Extract page information from PDF.

        Args:
            pdf_path: Path to PDF file
            src_fileid: Source file ID for logging

        Returns:
            List of page information
        """
        try:
            pages = PDFDetector.extract_pdf_pages(pdf_path)

            self.logger.info(f"mineru-converter:  extracted {len(pages)} pages from PDF")

            return pages

        except Exception as e:
            self.logger.error(f"mineru-converter:  page extraction failed: {str(e)}")
            return []