修复mineru的json解析报错
Some checks are pending
sync2gitee / repo-sync (push) Waiting to run
Typos Check / Spell Check with Typos (push) Waiting to run

This commit is contained in:
朱潮 2025-12-18 12:59:14 +08:00
parent 356856def4
commit b16afa5299
10 changed files with 115 additions and 46 deletions

2
.gitignore vendored
View File

@ -40,7 +40,7 @@ share/python-wheels/
.installed.cfg
*.egg
MANIFEST
uploads/
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.

2
CLAUDE.md Normal file
View File

@ -0,0 +1,2 @@
这个项目基于 Docker 部署,目前运行在 http://127.0.0.1:2008,通过 installer/docker-compose.yml 进行部署。
如果修改代码之后需要重启,就需要重启 docker-compose 容器。

View File

@ -697,12 +697,31 @@ class BaseMinerUExtractor:
cache_status_path = os.path.join(temp_dir, 'cache_status.json')
try:
status = {
'status': 'complete',
'completed_pages': total_pages,
'total_pages': total_pages,
'completed_at': time.time()
}
# First try to load existing status to preserve completed_indices
if os.path.exists(cache_status_path):
with open(cache_status_path, 'r', encoding='utf-8') as f:
status = json.load(f)
else:
status = {
'status': 'complete',
'completed_pages': total_pages,
'total_pages': total_pages,
'completed_indices': [],
'started_at': time.time(),
'updated_at': time.time(),
'completed_at': time.time()
}
# Update status to complete
status['status'] = 'complete'
status['completed_pages'] = total_pages
status['total_pages'] = total_pages
status['completed_at'] = time.time()
status['updated_at'] = time.time()
# Ensure completed_indices exists
if 'completed_indices' not in status:
status['completed_indices'] = []
with open(cache_status_path, 'w', encoding='utf-8') as f:
json.dump(status, f, ensure_ascii=False, indent=2)

View File

@ -95,7 +95,7 @@ class MinerUConfig:
# Queue Processing Configuration
queue_size: int = 50
processing_timeout: int = 600
processing_timeout: int = 7200
num_parser_threads: int = 1
num_refiner_threads: int = 1
num_recognizer_threads: int = 1

View File

@ -543,9 +543,6 @@ class MinerUImageProcessor:
# Build context-aware prompt with language
prompt = self._build_context_aware_prompt(context, language_code)
# Log the final prompt for debugging
self.logger.info(f"mineru-image: Final prompt for {os.path.basename(image_filepath)}:\n{prompt[:1000]}...")
messages = [
{'role': 'system', 'content': prompt},
{'role': 'user', 'content': [
@ -681,8 +678,59 @@ class MinerUImageProcessor:
json_end = response_content.find('```', json_start)
response_content = response_content[json_start:json_end].strip()
# Parse JSON
result_json = json.loads(response_content)
# Parse JSON with proper handling of multiline strings
import re
import ast
try:
# First try direct parsing (handles properly formatted JSON)
result_json = json.loads(response_content)
except json.JSONDecodeError as e:
# If parsing fails, try to fix unescaped newlines in string values
self.logger.debug(f"mineru-image: Initial JSON parse failed, attempting recovery: {str(e)}")
# Extract JSON from markdown if present
if '```json' in response_content and '```' in response_content:
json_start = response_content.find('```json') + 7
json_end = response_content.find('```', json_start)
json_str = response_content[json_start:json_end].strip()
else:
# Extract JSON between first { and last }
start_idx = response_content.find('{')
end_idx = response_content.rfind('}')
if start_idx != -1 and end_idx != -1 and end_idx > start_idx:
json_str = response_content[start_idx:end_idx + 1]
else:
json_str = response_content
# Method 1: Try using ast.literal_eval which is more forgiving
try:
# Replace single quotes with double quotes for JSON compatibility
json_str_fixed = json_str.replace("'", '"')
result_dict = ast.literal_eval(json_str_fixed)
result_json = dict(result_dict) # Convert to regular dict
except:
# Method 2: Manually fix newlines in string values
# Use regex to find string values and escape newlines within them
def escape_newlines_in_strings(match):
    """Re-emit one matched string literal with raw control characters escaped.

    Args:
        match: Regex match whose group 1 is the text between the quotes of a
            string literal. Every backslash in it is expected to start a
            two-character escape pair, and it is expected to contain no bare
            double quote.

    Returns:
        The literal wrapped in double quotes, with raw newline, carriage
        return and tab characters turned into their escape sequences.
        Escape sequences that are already valid JSON are kept unchanged;
        a backslash that starts an invalid escape is doubled so it decodes
        as a literal backslash.
    """
    def _keep_or_double(escape):
        # A recognised JSON escape is already valid and must stay as-is:
        # doubling its backslash would change the decoded value.
        sequence = escape.group(1)
        if sequence[0] in '"\\/bfnrt' or len(sequence) == 5:
            return escape.group(0)
        # Anything else (e.g. "\a", or "\u" without four hex digits) is not
        # a JSON escape; double the backslash so it parses as a literal one.
        return '\\\\' + sequence

    # Walk the backslash pairs left to right so that an escaped backslash
    # ("\\") is consumed as one unit and never mistaken for the start of
    # the following character's escape.
    string_content = re.sub(
        r'\\(u[0-9a-fA-F]{4}|.)', _keep_or_double, match.group(1)
    )
    # Raw control characters are what make the literal invalid JSON.
    string_content = string_content.replace('\n', '\\n')  # Escape newlines
    string_content = string_content.replace('\r', '\\r')  # Escape carriage returns
    string_content = string_content.replace('\t', '\\t')  # Escape tabs
    return f'"{string_content}"'
# Pattern to match string values in JSON
pattern = r'"([^"\\]*(?:\\.[^"\\]*)*)"'
json_str = re.sub(pattern, escape_newlines_in_strings, json_str)
# Remove any remaining invalid control characters
json_str = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x9f]', '', json_str)
# Try parsing the fixed JSON
result_json = json.loads(json_str)
# Log the raw classification response for debugging
self.logger.info(f"mineru-image: parsed JSON response: {result_json}")

View File

@ -300,7 +300,7 @@ class MinerUAdapter:
thread = threading.Thread(target=run_async)
thread.start()
thread.join(timeout=300) # 5分钟超时
thread.join(timeout=int(os.environ.get('MINERU_PROCESSING_TIMEOUT', 7200))) # 2h超时
if exception:
raise exception

View File

@ -592,7 +592,6 @@ class ParallelMinerUProcessor:
img_filepath, img_filename, xref
)
images_to_process.append(image_info)
self.logger.info(f"Recognizer: loaded image info: {image_info}")
else:
self.logger.warning(f"Recognizer: image file not found: {img_filepath}")

View File

@ -14,13 +14,13 @@ RUN cd ui && ls -la && if [ -d "dist" ]; then exit 0; fi && \
FROM ghcr.io/1panel-dev/maxkb-base:python3.11-pg17.6 AS stage-build
# Configure apt to use Aliyun mirror (for Debian 12 bookworm)
RUN if [ -f /etc/apt/sources.list.d/debian.sources ]; then \
sed -i 's/deb.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list.d/debian.sources && \
sed -i 's/security.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list.d/debian.sources; \
else \
sed -i 's/deb.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list && \
sed -i 's/security.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list; \
fi
#RUN if [ -f /etc/apt/sources.list.d/debian.sources ]; then \
# sed -i 's/deb.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list.d/debian.sources && \
# sed -i 's/security.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list.d/debian.sources; \
# else \
# sed -i 's/deb.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list && \
# sed -i 's/security.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list; \
# fi
ARG DEPENDENCIES=" \
python3-pip \

View File

@ -1,18 +1,19 @@
version: '3.8'
version: "3.8"
services:
maxkb-dev:
image: 192.168.101.129:2009/maxkb-dev:v1
image: maxkb-local
# 如果需要本地构建,取消下面两行注释
#build:
# context: ../
# dockerfile: installer/Dockerfile
# build:
# context: ../
# dockerfile: installer/Dockerfile
container_name: maxkb-dev
ports:
- "2008:8080"
volumes:
# 持久化数据
- ~/.maxkb:/opt/maxkb/
- ../apps:/opt/maxkb-app/apps
- /tmp/maxkb:/tmp
environment:
# 开启调试模式
@ -26,12 +27,12 @@ services:
# MinerU 图片存储路径
MAXKB_STORAGE_PATH: "/opt/maxkb/storage"
#MINERU_API_TYPE: "self_hosted"
#MINERU_API_URL: "http://mineru:8000"
MINERU_API_TYPE: "self_hosted"
MINERU_API_URL: "http://192.168.102.5:9987"
MINERU_API_TYPE: "cloud"
MINERU_API_URL: "https://mineru.net"
MINERU_API_KEY: "eyJ0eXBlIjoiSldUIiwiYWxnIjoiSFM1MTIifQ.eyJqdGkiOiI2OTYwMDEwNiIsInJvbCI6IlJPTEVfUkVHSVNURVIiLCJpc3MiOiJPcGVuWExhYiIsImlhdCI6MTc1NTE2MzQ5OCwiY2xpZW50SWQiOiJsa3pkeDU3bnZ5MjJqa3BxOXgydyIsInBob25lIjoiIiwib3BlbklkIjpudWxsLCJ1dWlkIjoiYTQwODk5NjMtNDI1OS00MWM3LWE3NWItY2IzZTQ4NTRjYWIwIiwiZW1haWwiOiIiLCJleHAiOjE3NTYzNzMwOTh9.88m9JSKQhkwJ557jCTFOgmdjeAfpXzxy2QDINaJ0rfHfnMNBxQt47aHr2jABeuxW-fXm8S5AO7zWWTXEGx8BxA"
# MINERU_API_TYPE: "cloud"
# MINERU_API_URL: "https://mineru.net"
# MINERU_API_KEY: "eyJ0eXBlIjoiSldUIiwiYWxnIjoiSFM1MTIifQ.eyJqdGkiOiI2OTYwMDEwNiIsInJvbCI6IlJPTEVfUkVHSVNURVIiLCJpc3MiOiJPcGVuWExhYiIsImlhdCI6MTc1NTE2MzQ5OCwiY2xpZW50SWQiOiJsa3pkeDU3bnZ5MjJqa3BxOXgydyIsInBob25lIjoiIiwib3BlbklkIjpudWxsLCJ1dWlkIjoiYTQwODk5NjMtNDI1OS00MWM3LWE3NWItY2IzZTQ4NTRjYWIwIiwiZW1haWwiOiIiLCJleHAiOjE3NTYzNzMwOTh9.88m9JSKQhkwJ557jCTFOgmdjeAfpXzxy2QDINaJ0rfHfnMNBxQt47aHr2jABeuxW-fXm8S5AO7zWWTXEGx8BxA"
# 配置队列大小
MINERU_QUEUE_SIZE: "50"
@ -59,8 +60,6 @@ services:
MINERU_FILTER_MEANINGLESS: "true"
# 使用默认的启动命令,让容器内的所有服务正常启动
# command 留空使用镜像默认的 entrypoint
# 如果使用绑定挂载,可以删除这个 volumes 部分
# volumes:
# maxkb_data:

View File

@ -49,8 +49,10 @@ dependencies = [
"python-docx==1.2.0",
"xlrd==2.0.2",
"xlwt==1.3.0",
"pymupdf==1.26.3",
"pypdf==6.0.0",
"python-pptx==1.0.2",
"reportlab==4.2.5",
"pymupdf==1.26.3",
"pydub==0.25.1",
"pillow==11.0.0",
"pdf2image==1.17.0",