"""
File conversion module for MinerU-based parsing.

This module handles file type detection, PPT/DOC/image to PDF conversion,
and related file processing operations.
"""

import os
import asyncio
import shutil
import subprocess
from pathlib import Path
from typing import Tuple, Optional
from .logger import get_module_logger
logger = get_module_logger('converter')

from .utils import FileConverter, PDFDetector
from .config_base import MinerUConfig


class DocumentConverter:
    """Document conversion handler with PPT support.

    Every conversion path produces ``<temp_dir>/source.pdf`` so downstream
    steps can rely on a single, fixed file name.
    """

    def __init__(self, config: MinerUConfig):
        """Store the configuration and bind the module-level logger.

        Args:
            config: Settings object. The attributes read by this class are
                ``max_file_size``, ``libreoffice_path`` and
                ``conversion_timeout``.
        """
        self.config = config
        self.logger = logger

    async def handle_file_input(self, file_path: str, temp_dir: str, src_fileid: str) -> Tuple[str, bool]:
        """
        Handle input file processing - detect type and convert if needed.

        Args:
            file_path: Input file path
            temp_dir: Temporary directory for processing
            src_fileid: Source file ID. It is forwarded to the conversion
                helpers; none of the log messages here include it.

        Returns:
            Tuple of (pdf_path, is_ppt_source). ``is_ppt_source`` is True for
            PPT/PPTX and also for DOC/DOCX inputs (DOC is treated like PPT);
            it is False for images and for PDFs.

        Raises:
            ValueError: If the file is larger than the configured limit or
                its detected type is not supported.
            Exception: Any conversion failure is logged and re-raised.
        """
        try:
            file_type = FileConverter.detect_file_type(file_path)
            self.logger.info(f"mineru-converter: detected file type: {file_type}")

            # Validate file size. The configured limit is divided by
            # 1024 * 1024 and reported in MB, i.e. it is treated as bytes.
            max_size_mb = self.config.max_file_size / (1024 * 1024)
            if not FileConverter.validate_file_size(file_path, max_size_mb):
                raise ValueError(f"File size exceeds maximum allowed size of {max_size_mb:.1f}MB")

            if file_type in ('ppt', 'pptx'):
                # PPT file needs conversion
                self.logger.info("mineru-converter: converting PPT to PDF")
                pdf_path = await self._convert_ppt_to_pdf(file_path, temp_dir, src_fileid)
                return pdf_path, True

            if file_type in ('doc', 'docx'):
                # DOC file needs conversion (treated like PPT by callers)
                self.logger.info("mineru-converter: converting DOC to PDF")
                pdf_path = await self._convert_doc_to_pdf(file_path, temp_dir, src_fileid)
                return pdf_path, True

            if file_type == 'image':
                # Image file needs conversion; images are not PPT-like
                self.logger.info("mineru-converter: converting image to PDF")
                pdf_path = await self._convert_image_to_pdf(file_path, temp_dir, src_fileid)
                return pdf_path, False

            if file_type == 'pdf':
                # PDF file - copy to temp directory for processing
                self.logger.info("mineru-converter: processing PDF directly")
                pdf_path = os.path.join(temp_dir, "source.pdf")
                try:
                    shutil.copy(file_path, pdf_path)
                except shutil.SameFileError:
                    # The input already is <temp_dir>/source.pdf; nothing to copy.
                    pass
                return pdf_path, False

            raise ValueError(f"Unsupported file type: {file_type}")

        except Exception as e:
            self.logger.error(f"mineru-converter: file input handling failed: {e}")
            raise

    def _adopt_libreoffice_output(self, source_path: str, temp_dir: str, pdf_path: str) -> None:
        """Move the PDF written by LibreOffice to the standard ``pdf_path``.

        LibreOffice names its output after the input file's stem. If that
        file is missing but ``pdf_path`` already exists, the existing file is
        accepted as-is.

        Raises:
            Exception: If neither the generated file nor ``pdf_path`` exists.
        """
        generated_pdf = os.path.join(temp_dir, Path(source_path).stem + ".pdf")
        if os.path.exists(generated_pdf):
            if generated_pdf != pdf_path:
                # os.replace overwrites an existing target on every platform,
                # whereas os.rename fails on Windows if the target exists.
                os.replace(generated_pdf, pdf_path)
        elif not os.path.exists(pdf_path):
            raise Exception("PDF file was not generated by LibreOffice")

    async def _convert_ppt_to_pdf(self, ppt_path: str, temp_dir: str, src_fileid: str) -> str:
        """
        Convert PPT file to PDF using LibreOffice or alternative method.

        Args:
            ppt_path: Path to PPT file
            temp_dir: Temporary directory
            src_fileid: Source file ID (forwarded to helpers)

        Returns:
            Path to converted PDF file

        Raises:
            Exception: Wraps any failure as "PPT to PDF conversion failed: ...",
                chained to the original exception.
        """
        try:
            pdf_path = os.path.join(temp_dir, "source.pdf")

            # Try LibreOffice conversion first
            success = await self._try_libreoffice_conversion(ppt_path, temp_dir, src_fileid)

            if success:
                self._adopt_libreoffice_output(ppt_path, temp_dir, pdf_path)
            else:
                # Fallback to alternative method
                self.logger.warning("mineru-converter: LibreOffice failed, trying alternative conversion")
                pdf_path = await self._try_alternative_conversion(ppt_path, temp_dir, src_fileid)

            if not os.path.exists(pdf_path):
                raise Exception("PDF conversion failed - no output file generated")

            self.logger.info(f"mineru-converter: PPT to PDF conversion successful: {pdf_path}")
            return pdf_path

        except Exception as e:
            self.logger.error(f"mineru-converter: PPT conversion error: {e}")
            raise Exception(f"PPT to PDF conversion failed: {e}") from e

    async def _try_libreoffice_conversion(self, ppt_path: str, temp_dir: str, src_fileid: str) -> bool:
        """
        Try converting a document to PDF using LibreOffice in headless mode.

        Despite the parameter name, this is also used for DOC/DOCX input.
        The output is written into ``temp_dir``. This method never raises:
        every failure is logged as a warning and reported as False.

        Args:
            ppt_path: Path to the input document
            temp_dir: Output directory passed to ``--outdir``
            src_fileid: Source file ID (not used here)

        Returns:
            True if conversion successful, False otherwise
        """
        try:
            cmd = [
                self.config.libreoffice_path,
                '--headless',
                '--convert-to', 'pdf',
                '--outdir', temp_dir,
                ppt_path,
            ]

            self.logger.info("mineru-converter: executing LibreOffice conversion")

            process = await asyncio.create_subprocess_exec(
                *cmd,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE,
            )

            try:
                _stdout, stderr = await asyncio.wait_for(
                    process.communicate(),
                    timeout=self.config.conversion_timeout,
                )
            except asyncio.TimeoutError:
                self.logger.warning("mineru-converter: LibreOffice conversion timeout")
                try:
                    process.kill()
                except ProcessLookupError:
                    # The process exited between the timeout and the kill.
                    pass
                # Reap the child so it does not linger as a zombie process.
                await process.wait()
                return False

            if process.returncode != 0:
                # errors="replace" keeps the diagnostic readable even if
                # LibreOffice emits bytes that are not valid UTF-8.
                message = stderr.decode(errors="replace")
                self.logger.warning(f"mineru-converter: LibreOffice returned non-zero: {message}")
                return False

            self.logger.info("mineru-converter: LibreOffice conversion completed successfully")
            return True

        except FileNotFoundError:
            self.logger.warning(f"mineru-converter: LibreOffice not found at: {self.config.libreoffice_path}")
            return False
        except Exception as e:
            self.logger.warning(f"mineru-converter: LibreOffice conversion error: {e}")
            return False

    async def _try_alternative_conversion(self, ppt_path: str, temp_dir: str, src_fileid: str) -> str:
        """
        Try alternative PPT conversion method using python libraries.

        Only the text of each slide is rendered (one PDF page per slide,
        word-wrapped at roughly 80 characters); layout, images and styling
        are not reproduced. Text that does not fit above the bottom margin
        of a page is dropped.

        Args:
            ppt_path: Path to PPT file
            temp_dir: Temporary directory
            src_fileid: Source file ID (not used here)

        Returns:
            Path to converted PDF file

        Raises:
            Exception: If python-pptx / reportlab are missing or the
                conversion fails; chained to the original exception.
        """
        try:
            from pptx import Presentation
            from reportlab.pdfgen import canvas
            from reportlab.lib.pagesizes import A4

            pdf_path = os.path.join(temp_dir, "source.pdf")

            self.logger.info("mineru-converter: using python-pptx + reportlab conversion")

            # Load PPT presentation
            prs = Presentation(ppt_path)

            # Create PDF
            c = canvas.Canvas(pdf_path, pagesize=A4)
            page_width, page_height = A4
            margin = 50

            for slide_idx, slide in enumerate(prs.slides):
                if slide_idx > 0:
                    c.showPage()

                y_position = page_height - 50  # Start from top

                # Extract text content from slide
                slide_texts = [
                    shape.text.strip()
                    for shape in slide.shapes
                    if hasattr(shape, "text") and shape.text.strip()
                ]

                # Write text to PDF
                for text in slide_texts:
                    if y_position < margin:
                        break  # Avoid writing below page margin

                    # Greedy word wrap: flush the current line once adding
                    # the next word would exceed the character limit.
                    line = ""
                    for word in text.split():
                        test_line = f"{line} {word}".strip()
                        if len(test_line) > 80:  # Approximate character limit per line
                            if line:
                                c.drawString(margin, y_position, line)
                                y_position -= 15
                                if y_position < margin:
                                    break
                            line = word
                        else:
                            line = test_line

                    if line and y_position >= margin:
                        c.drawString(margin, y_position, line)
                        y_position -= 20  # Extra space between text blocks

                # Add slide number
                c.drawString(page_width - 100, 30, f"Slide {slide_idx + 1}")

            c.save()

            self.logger.info(f"mineru-converter: alternative conversion completed: {pdf_path}")
            return pdf_path

        except ImportError as e:
            raise Exception(f"Alternative conversion requires python-pptx and reportlab: {e}") from e
        except Exception as e:
            raise Exception(f"Alternative PPT conversion failed: {e}") from e

    async def _convert_doc_to_pdf(self, doc_path: str, temp_dir: str, src_fileid: str) -> str:
        """
        Convert DOC file to PDF using LibreOffice or alternative method.

        Args:
            doc_path: Path to DOC file
            temp_dir: Temporary directory
            src_fileid: Source file ID (forwarded to helpers)

        Returns:
            Path to converted PDF file

        Raises:
            Exception: Wraps any failure as "DOC to PDF conversion failed: ...",
                chained to the original exception.
        """
        try:
            pdf_path = os.path.join(temp_dir, "source.pdf")

            # Try LibreOffice conversion first
            success = await self._try_libreoffice_conversion(doc_path, temp_dir, src_fileid)

            if success:
                self._adopt_libreoffice_output(doc_path, temp_dir, pdf_path)
            else:
                # Fallback to alternative method
                self.logger.warning("mineru-converter: LibreOffice failed, using FileConverter for DOC")
                pdf_path = await FileConverter.convert_doc_to_pdf(doc_path, temp_dir)

            if not os.path.exists(pdf_path):
                raise Exception("PDF conversion failed - no output file generated")

            self.logger.info(f"mineru-converter: DOC to PDF conversion successful: {pdf_path}")
            return pdf_path

        except Exception as e:
            self.logger.error(f"mineru-converter: DOC conversion error: {e}")
            raise Exception(f"DOC to PDF conversion failed: {e}") from e

    async def _convert_image_to_pdf(self, image_path: str, temp_dir: str, src_fileid: str) -> str:
        """
        Convert image file to PDF.

        Args:
            image_path: Path to image file
            temp_dir: Temporary directory
            src_fileid: Source file ID (not used here)

        Returns:
            Path to converted PDF file

        Raises:
            Exception: Wraps any failure as "Image to PDF conversion failed: ...",
                chained to the original exception.
        """
        try:
            pdf_path = os.path.join(temp_dir, "source.pdf")

            # Use FileConverter utility
            converted_path = await FileConverter.convert_image_to_pdf(image_path, temp_dir)

            # Move to the standard name if needed; os.replace overwrites an
            # existing target on every platform.
            if converted_path != pdf_path and os.path.exists(converted_path):
                os.replace(converted_path, pdf_path)

            if not os.path.exists(pdf_path):
                raise Exception("PDF conversion failed - no output file generated")

            self.logger.info(f"mineru-converter: image to PDF conversion successful: {pdf_path}")
            return pdf_path

        except Exception as e:
            self.logger.error(f"mineru-converter: image conversion error: {e}")
            raise Exception(f"Image to PDF conversion failed: {e}") from e

    def detect_pdf_format(self, pdf_path: str, src_fileid: str) -> Tuple[bool, dict]:
        """
        Detect if PDF was converted from PowerPoint.

        Best-effort: any failure is logged and reported as ``(False, {})``.

        Args:
            pdf_path: Path to PDF file
            src_fileid: Source file ID (not used here)

        Returns:
            Tuple of (is_ppt_format, metadata)
        """
        try:
            is_ppt, metadata = PDFDetector.is_pdf_from_ppt(pdf_path)
            self.logger.info(f"mineru-converter: PDF format detection: is_ppt={is_ppt}, metadata={metadata}")
            return is_ppt, metadata
        except Exception as e:
            self.logger.error(f"mineru-converter: PDF format detection failed: {e}")
            return False, {}

    def extract_pdf_pages(self, pdf_path: str, src_fileid: str) -> list:
        """
        Extract page information from PDF.

        Best-effort: any failure is logged and reported as an empty list.

        Args:
            pdf_path: Path to PDF file
            src_fileid: Source file ID (not used here)

        Returns:
            List of page information
        """
        try:
            pages = PDFDetector.extract_pdf_pages(pdf_path)
            self.logger.info(f"mineru-converter: extracted {len(pages)} pages from PDF")
            return pages
        except Exception as e:
            self.logger.error(f"mineru-converter: page extraction failed: {e}")
            return []