maxkb/apps/common/handle/impl/mineru/converter.py
2025-08-24 00:56:02 +08:00

367 lines
15 KiB
Python

"""
File conversion module for MinerU-based parsing.
This module handles file type detection, PPT to PDF conversion,
and related file processing operations.
"""
import os
import asyncio
import subprocess
from pathlib import Path
from typing import Tuple, Optional
from .logger import get_module_logger
logger = get_module_logger('converter')
from .utils import FileConverter, PDFDetector
from .config_base import MinerUConfig
class DocumentConverter:
    """Normalise supported input documents to a PDF inside a working directory.

    PPT/PPTX and DOC/DOCX inputs are converted with LibreOffice first and fall
    back to a Python-based converter when LibreOffice fails; images are
    converted through ``FileConverter``; PDFs are copied unchanged. Every path
    ends with a file named ``source.pdf`` in the caller's temp directory.
    """

    # File name every conversion path normalises its output to.
    _STANDARD_PDF_NAME = "source.pdf"
    # Approximate number of characters per line in the fallback PPT renderer.
    _FALLBACK_LINE_WIDTH = 80
    # Upper bound (seconds) on waiting for a killed LibreOffice process to exit.
    _KILL_WAIT_SECONDS = 5

    def __init__(self, config: "MinerUConfig"):
        """Store the configuration and bind the module-level logger.

        Args:
            config: Settings object; this class reads ``max_file_size``,
                ``libreoffice_path`` and ``conversion_timeout`` from it.
        """
        self.config = config
        self.logger = logger

    async def handle_file_input(self, file_path: str, temp_dir: str, src_fileid: str) -> Tuple[str, bool]:
        """
        Handle input file processing - detect type and convert if needed.

        Args:
            file_path: Input file path
            temp_dir: Temporary directory for processing
            src_fileid: Source file ID for logging

        Returns:
            Tuple of (pdf_path, is_ppt_source). ``is_ppt_source`` is True for
            PPT/PPTX and DOC/DOCX inputs, False for images and PDFs.

        Raises:
            ValueError: If the file is too large or its type is unsupported.
            Exception: Any conversion failure, re-raised after logging.
        """
        try:
            file_type = FileConverter.detect_file_type(file_path)
            self.logger.info(f"mineru-converter: detected file type: {file_type}")

            # Validate file size (the limit is configured in bytes, checked in MB)
            max_size_mb = self.config.max_file_size / (1024 * 1024)
            if not FileConverter.validate_file_size(file_path, max_size_mb):
                raise ValueError(f"File size exceeds maximum allowed size of {max_size_mb:.1f}MB")

            if file_type in ('ppt', 'pptx'):
                # PPT file needs conversion
                self.logger.info("mineru-converter: converting PPT to PDF")
                pdf_path = await self._convert_ppt_to_pdf(file_path, temp_dir, src_fileid)
                return pdf_path, True  # is_ppt_source = True

            if file_type in ('doc', 'docx'):
                # DOC file needs conversion (treat similar to PPT)
                self.logger.info("mineru-converter: converting DOC to PDF")
                pdf_path = await self._convert_doc_to_pdf(file_path, temp_dir, src_fileid)
                return pdf_path, True  # is_ppt_source = True (DOC treated like PPT)

            if file_type == 'image':
                # Image file needs conversion
                self.logger.info("mineru-converter: converting image to PDF")
                pdf_path = await self._convert_image_to_pdf(file_path, temp_dir, src_fileid)
                return pdf_path, False  # is_ppt_source = False (images are not PPT-like)

            if file_type == 'pdf':
                # PDF file - copy to temp directory for processing
                self.logger.info("mineru-converter: processing PDF directly")
                pdf_path = os.path.join(temp_dir, self._STANDARD_PDF_NAME)
                import shutil
                try:
                    shutil.copy(file_path, pdf_path)
                except shutil.SameFileError:
                    # The input already is the working copy; nothing to do.
                    pass
                return pdf_path, False  # is_ppt_source = False

            raise ValueError(f"Unsupported file type: {file_type}")
        except Exception as e:
            self.logger.error(f"mineru-converter: file input handling failed: {e}")
            raise

    async def _convert_office_to_pdf(self, source_path: str, temp_dir: str, src_fileid: str,
                                     label: str, fallback_notice: str, fallback) -> str:
        """Shared LibreOffice-then-fallback pipeline for PPT and DOC inputs.

        Args:
            source_path: Path to the office document
            temp_dir: Directory that receives ``source.pdf``
            src_fileid: Source file ID for logging
            label: Short format name ("PPT" / "DOC") used in log and error text
            fallback_notice: Warning logged when LibreOffice fails
            fallback: Zero-argument callable returning an awaitable that
                resolves to the path of the PDF produced by the fallback

        Returns:
            Path to the converted PDF file

        Raises:
            Exception: Wrapping any failure, with the original as ``__cause__``.
        """
        try:
            pdf_path = os.path.join(temp_dir, self._STANDARD_PDF_NAME)

            # Try LibreOffice conversion first
            if await self._try_libreoffice_conversion(source_path, temp_dir, src_fileid):
                # LibreOffice generates file with original name
                generated_pdf = os.path.join(temp_dir, Path(source_path).stem + ".pdf")
                if os.path.exists(generated_pdf):
                    if generated_pdf != pdf_path:
                        # os.replace overwrites an existing target on every
                        # platform; os.rename fails on Windows in that case.
                        os.replace(generated_pdf, pdf_path)
                elif not os.path.exists(pdf_path):
                    raise Exception("PDF file was not generated by LibreOffice")
            else:
                # Fallback to alternative method
                self.logger.warning(fallback_notice)
                pdf_path = await fallback()

            if not os.path.exists(pdf_path):
                raise Exception("PDF conversion failed - no output file generated")

            self.logger.info(f"mineru-converter: {label} to PDF conversion successful: {pdf_path}")
            return pdf_path
        except Exception as e:
            self.logger.error(f"mineru-converter: {label} conversion error: {e}")
            raise Exception(f"{label} to PDF conversion failed: {e}") from e

    async def _convert_ppt_to_pdf(self, ppt_path: str, temp_dir: str, src_fileid: str) -> str:
        """
        Convert PPT file to PDF using LibreOffice or alternative method.

        Args:
            ppt_path: Path to PPT file
            temp_dir: Temporary directory
            src_fileid: Source file ID for logging

        Returns:
            Path to converted PDF file

        Raises:
            Exception: "PPT to PDF conversion failed: ..." on any failure.
        """
        return await self._convert_office_to_pdf(
            ppt_path, temp_dir, src_fileid,
            "PPT",
            "mineru-converter: LibreOffice failed, trying alternative conversion",
            lambda: self._try_alternative_conversion(ppt_path, temp_dir, src_fileid),
        )

    async def _terminate_process(self, process) -> None:
        """Kill a subprocess and reap it so it is not left as a zombie."""
        try:
            process.kill()
        except ProcessLookupError:
            # The process exited between the timeout and the kill.
            pass
        try:
            # Bounded wait: never let cleanup itself hang the conversion.
            await asyncio.wait_for(process.wait(), timeout=self._KILL_WAIT_SECONDS)
        except asyncio.TimeoutError:
            self.logger.warning("mineru-converter: LibreOffice process did not exit after kill")

    async def _try_libreoffice_conversion(self, ppt_path: str, temp_dir: str, src_fileid: str) -> bool:
        """
        Try converting a document to PDF using LibreOffice in headless mode.

        The output is written into ``temp_dir``. This method never raises:
        every failure is logged as a warning and reported as False.

        Args:
            ppt_path: Path to the document to convert
            temp_dir: Output directory passed to ``--outdir``
            src_fileid: Source file ID for logging

        Returns:
            True if conversion successful, False otherwise
        """
        try:
            cmd = [
                self.config.libreoffice_path,
                '--headless',
                '--convert-to', 'pdf',
                '--outdir', temp_dir,
                ppt_path,
            ]
            self.logger.info("mineru-converter: executing LibreOffice conversion")

            process = await asyncio.create_subprocess_exec(
                *cmd,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE,
            )

            # Run with timeout
            try:
                _stdout, stderr = await asyncio.wait_for(
                    process.communicate(),
                    timeout=self.config.conversion_timeout,
                )
            except asyncio.TimeoutError:
                self.logger.warning("mineru-converter: LibreOffice conversion timeout")
                await self._terminate_process(process)
                return False

            if process.returncode != 0:
                # errors="replace": tool output is not guaranteed to be UTF-8
                # and a decode error must not mask the real diagnostic.
                self.logger.warning(
                    f"mineru-converter: LibreOffice returned non-zero: {stderr.decode(errors='replace')}"
                )
                return False

            self.logger.info("mineru-converter: LibreOffice conversion completed successfully")
            return True
        except FileNotFoundError:
            self.logger.warning(f"mineru-converter: LibreOffice not found at: {self.config.libreoffice_path}")
            return False
        except Exception as e:
            self.logger.warning(f"mineru-converter: LibreOffice conversion error: {e}")
            return False

    @staticmethod
    def _wrap_text(text: str, max_chars: int = 80) -> list:
        """Greedily wrap whitespace-separated words into lines of at most max_chars.

        A single word longer than ``max_chars`` is kept whole on its own line.
        Returns an empty list for empty or whitespace-only text.
        """
        lines = []
        line = ""
        for word in text.split():
            candidate = f"{line} {word}".strip()
            if len(candidate) > max_chars:
                if line:
                    lines.append(line)
                line = word
            else:
                line = candidate
        if line:
            lines.append(line)
        return lines

    async def _try_alternative_conversion(self, ppt_path: str, temp_dir: str, src_fileid: str) -> str:
        """
        Try alternative PPT conversion method using python libraries.

        Renders one PDF page per slide containing only the slide's text
        (python-pptx for reading, reportlab for writing). Text that does not
        fit above the bottom margin of its page is dropped.

        Args:
            ppt_path: Path to PPT file
            temp_dir: Temporary directory
            src_fileid: Source file ID for logging

        Returns:
            Path to converted PDF file

        Raises:
            Exception: If the libraries are missing or rendering fails.
        """
        try:
            from pptx import Presentation
            from reportlab.pdfgen import canvas
            from reportlab.lib.pagesizes import A4

            pdf_path = os.path.join(temp_dir, self._STANDARD_PDF_NAME)
            self.logger.info("mineru-converter: using python-pptx + reportlab conversion")

            # Load PPT presentation
            prs = Presentation(ppt_path)

            # Create PDF
            c = canvas.Canvas(pdf_path, pagesize=A4)
            page_width, page_height = A4
            margin = 50

            for slide_idx, slide in enumerate(prs.slides):
                if slide_idx > 0:
                    c.showPage()

                y_position = page_height - 50  # Start from top

                # Extract text content from slide
                slide_texts = [
                    shape.text.strip()
                    for shape in slide.shapes
                    if hasattr(shape, "text") and shape.text.strip()
                ]

                # Write text to PDF
                for text in slide_texts:
                    if y_position < margin:
                        break  # Avoid writing below page margin
                    wrapped = self._wrap_text(text, self._FALLBACK_LINE_WIDTH)
                    last_idx = len(wrapped) - 1
                    for idx, line in enumerate(wrapped):
                        if y_position < margin:
                            break
                        c.drawString(margin, y_position, line)
                        # 15pt between wrapped lines, 20pt after a text block
                        y_position -= 20 if idx == last_idx else 15

                # Add slide number
                c.drawString(page_width - 100, 30, f"Slide {slide_idx + 1}")

            c.save()
            self.logger.info(f"mineru-converter: alternative conversion completed: {pdf_path}")
            return pdf_path
        except ImportError as e:
            raise Exception(f"Alternative conversion requires python-pptx and reportlab: {e}") from e
        except Exception as e:
            raise Exception(f"Alternative PPT conversion failed: {e}") from e

    async def _convert_doc_to_pdf(self, doc_path: str, temp_dir: str, src_fileid: str) -> str:
        """
        Convert DOC file to PDF using LibreOffice or alternative method.

        Args:
            doc_path: Path to DOC file
            temp_dir: Temporary directory
            src_fileid: Source file ID for logging

        Returns:
            Path to converted PDF file

        Raises:
            Exception: "DOC to PDF conversion failed: ..." on any failure.
        """
        return await self._convert_office_to_pdf(
            doc_path, temp_dir, src_fileid,
            "DOC",
            "mineru-converter: LibreOffice failed, using FileConverter for DOC",
            lambda: FileConverter.convert_doc_to_pdf(doc_path, temp_dir),
        )

    async def _convert_image_to_pdf(self, image_path: str, temp_dir: str, src_fileid: str) -> str:
        """
        Convert image file to PDF.

        Args:
            image_path: Path to image file
            temp_dir: Temporary directory
            src_fileid: Source file ID for logging

        Returns:
            Path to converted PDF file

        Raises:
            Exception: "Image to PDF conversion failed: ..." on any failure.
        """
        try:
            pdf_path = os.path.join(temp_dir, self._STANDARD_PDF_NAME)

            # Use FileConverter utility
            converted_path = await FileConverter.convert_image_to_pdf(image_path, temp_dir)

            # Rename to standard name if needed (os.replace overwrites portably)
            if converted_path != pdf_path and os.path.exists(converted_path):
                os.replace(converted_path, pdf_path)

            if not os.path.exists(pdf_path):
                raise Exception("PDF conversion failed - no output file generated")

            self.logger.info(f"mineru-converter: image to PDF conversion successful: {pdf_path}")
            return pdf_path
        except Exception as e:
            self.logger.error(f"mineru-converter: image conversion error: {e}")
            raise Exception(f"Image to PDF conversion failed: {e}") from e

    def detect_pdf_format(self, pdf_path: str, src_fileid: str) -> Tuple[bool, dict]:
        """
        Detect if PDF was converted from PowerPoint.

        Best-effort: any detection error is logged and reported as
        ``(False, {})`` instead of being raised.

        Args:
            pdf_path: Path to PDF file
            src_fileid: Source file ID for logging

        Returns:
            Tuple of (is_ppt_format, metadata)
        """
        try:
            is_ppt, metadata = PDFDetector.is_pdf_from_ppt(pdf_path)
            self.logger.info(f"mineru-converter: PDF format detection: is_ppt={is_ppt}, metadata={metadata}")
            return is_ppt, metadata
        except Exception as e:
            self.logger.error(f"mineru-converter: PDF format detection failed: {e}")
            return False, {}

    def extract_pdf_pages(self, pdf_path: str, src_fileid: str) -> list:
        """
        Extract page information from PDF.

        Best-effort: any extraction error is logged and reported as an
        empty list instead of being raised.

        Args:
            pdf_path: Path to PDF file
            src_fileid: Source file ID for logging

        Returns:
            List of page information
        """
        try:
            pages = PDFDetector.extract_pdf_pages(pdf_path)
            self.logger.info(f"mineru-converter: extracted {len(pages)} pages from PDF")
            return pages
        except Exception as e:
            self.logger.error(f"mineru-converter: page extraction failed: {e}")
            return []