add pptx convert
This commit is contained in:
parent
88a04fda5a
commit
86d86d9ff3
@ -131,15 +131,47 @@ class DocumentConverter:
|
|||||||
True if conversion successful, False otherwise
|
True if conversion successful, False otherwise
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
cmd = [
|
# Try direct conversion first; fall back to xvfb if needed
|
||||||
self.config.libreoffice_path,
|
use_xvfb = os.getenv('USE_XVFB', 'auto').lower()
|
||||||
'--headless',
|
|
||||||
'--convert-to', 'pdf',
|
if use_xvfb == 'auto':
|
||||||
'--outdir', temp_dir,
|
# Try direct conversion first
|
||||||
ppt_path
|
cmd = [
|
||||||
]
|
self.config.libreoffice_path,
|
||||||
|
'--headless',
|
||||||
self.logger.info(f"mineru-converter: executing LibreOffice conversion")
|
'--convert-to', 'pdf',
|
||||||
|
'--outdir', temp_dir,
|
||||||
|
ppt_path
|
||||||
|
]
|
||||||
|
use_xvfb_now = False
|
||||||
|
fallback_to_xvfb = True
|
||||||
|
elif use_xvfb == 'true':
|
||||||
|
# Force the use of xvfb
|
||||||
|
cmd = [
|
||||||
|
'xvfb-run',
|
||||||
|
'--auto-servernum',
|
||||||
|
'--server-args=-screen 0, 1024x768x24',
|
||||||
|
self.config.libreoffice_path,
|
||||||
|
'--headless',
|
||||||
|
'--convert-to', 'pdf',
|
||||||
|
'--outdir', temp_dir,
|
||||||
|
ppt_path
|
||||||
|
]
|
||||||
|
use_xvfb_now = True
|
||||||
|
fallback_to_xvfb = False
|
||||||
|
else:
|
||||||
|
# No xvfb
|
||||||
|
cmd = [
|
||||||
|
self.config.libreoffice_path,
|
||||||
|
'--headless',
|
||||||
|
'--convert-to', 'pdf',
|
||||||
|
'--outdir', temp_dir,
|
||||||
|
ppt_path
|
||||||
|
]
|
||||||
|
use_xvfb_now = False
|
||||||
|
fallback_to_xvfb = False
|
||||||
|
|
||||||
|
self.logger.info(f"mineru-converter: executing LibreOffice conversion (use_xvfb={use_xvfb}, fallback={fallback_to_xvfb})")
|
||||||
|
|
||||||
# Run with timeout
|
# Run with timeout
|
||||||
process = await asyncio.create_subprocess_exec(
|
process = await asyncio.create_subprocess_exec(
|
||||||
@ -156,8 +188,47 @@ class DocumentConverter:
|
|||||||
|
|
||||||
if process.returncode != 0:
|
if process.returncode != 0:
|
||||||
self.logger.warning(f"mineru-converter: LibreOffice returned non-zero: {stderr.decode()}")
|
self.logger.warning(f"mineru-converter: LibreOffice returned non-zero: {stderr.decode()}")
|
||||||
return False
|
# If direct conversion failed and we have xvfb fallback enabled
|
||||||
|
if fallback_to_xvfb and not use_xvfb_now:
|
||||||
|
self.logger.info(f"mineru-converter: direct conversion failed, retrying with xvfb")
|
||||||
|
fallback_cmd = [
|
||||||
|
'xvfb-run',
|
||||||
|
'--auto-servernum',
|
||||||
|
'--server-args=-screen 0, 1024x768x24',
|
||||||
|
self.config.libreoffice_path,
|
||||||
|
'--headless',
|
||||||
|
'--convert-to', 'pdf',
|
||||||
|
'--outdir', temp_dir,
|
||||||
|
ppt_path
|
||||||
|
]
|
||||||
|
|
||||||
|
# Try again with xvfb
|
||||||
|
process_xvfb = await asyncio.create_subprocess_exec(
|
||||||
|
*fallback_cmd,
|
||||||
|
stdout=asyncio.subprocess.PIPE,
|
||||||
|
stderr=asyncio.subprocess.PIPE
|
||||||
|
)
|
||||||
|
|
||||||
|
try:
|
||||||
|
stdout_xvfb, stderr_xvfb = await asyncio.wait_for(
|
||||||
|
process_xvfb.communicate(),
|
||||||
|
timeout=self.config.conversion_timeout
|
||||||
|
)
|
||||||
|
|
||||||
|
if process_xvfb.returncode == 0:
|
||||||
|
self.logger.info(f"mineru-converter: LibreOffice conversion with xvfb completed successfully")
|
||||||
|
return True
|
||||||
|
else:
|
||||||
|
self.logger.warning(f"mineru-converter: LibreOffice with xvfb also failed: {stderr_xvfb.decode()}")
|
||||||
|
return False
|
||||||
|
|
||||||
|
except asyncio.TimeoutError:
|
||||||
|
self.logger.warning(f"mineru-converter: LibreOffice with xvfb conversion timeout")
|
||||||
|
process_xvfb.kill()
|
||||||
|
return False
|
||||||
|
else:
|
||||||
|
return False
|
||||||
|
|
||||||
self.logger.info(f"mineru-converter: LibreOffice conversion completed successfully")
|
self.logger.info(f"mineru-converter: LibreOffice conversion completed successfully")
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
|||||||
@ -76,24 +76,18 @@ ENV MAXKB_VERSION="${DOCKER_IMAGE_TAG} (build at ${BUILD_AT}, commit: ${GITHUB_C
|
|||||||
PIP_TARGET=/opt/maxkb/python-packages
|
PIP_TARGET=/opt/maxkb/python-packages
|
||||||
|
|
||||||
# Install poppler-utils for PDF processing (required by MinerU)
|
# Install poppler-utils for PDF processing (required by MinerU)
|
||||||
# Install X11 libraries for LibreOffice (required for headless operation)
|
# Install essential libraries for LibreOffice headless operation
|
||||||
RUN apt-get update && \
|
RUN apt-get update && \
|
||||||
apt-get install -y --no-install-recommends \
|
apt-get install -y --no-install-recommends \
|
||||||
poppler-utils \
|
poppler-utils \
|
||||||
libxinerama1 \
|
libxinerama1 \
|
||||||
libxi6 \
|
libdbus-1-3 \
|
||||||
libxrender1 \
|
|
||||||
libxtst6 \
|
|
||||||
libxrandr2 \
|
|
||||||
libxext6 \
|
|
||||||
libxfixes3 \
|
|
||||||
libxcursor1 \
|
|
||||||
libxcomposite1 \
|
|
||||||
libxdamage1 \
|
|
||||||
libxss1 \
|
|
||||||
libxt6 \
|
|
||||||
libsm6 \
|
libsm6 \
|
||||||
libice6 && \
|
libice6 \
|
||||||
|
libxt6 \
|
||||||
|
libglib2.0-0 \
|
||||||
|
libcups2 \
|
||||||
|
xvfb && \
|
||||||
apt-get clean all && \
|
apt-get clean all && \
|
||||||
rm -rf /var/lib/apt/lists/*
|
rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
|||||||
@ -33,6 +33,10 @@ services:
|
|||||||
|
|
||||||
# LibreOffice 路径配置
|
# LibreOffice 路径配置
|
||||||
LIBREOFFICE_PATH: "soffice"
|
LIBREOFFICE_PATH: "soffice"
|
||||||
|
# LibreOffice headless mode 环境变量
|
||||||
|
DISPLAY: ":99"
|
||||||
|
# xvfb 模式: auto(自动检测是否需要), true(强制使用), false(不使用)
|
||||||
|
USE_XVFB: "auto"
|
||||||
|
|
||||||
# MINERU_API_TYPE: "cloud"
|
# MINERU_API_TYPE: "cloud"
|
||||||
# MINERU_API_URL: "https://mineru.net"
|
# MINERU_API_URL: "https://mineru.net"
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user