修复mineru的json解析报错
This commit is contained in:
parent
356856def4
commit
b16afa5299
2
.gitignore
vendored
2
.gitignore
vendored
@ -40,7 +40,7 @@ share/python-wheels/
|
||||
.installed.cfg
|
||||
*.egg
|
||||
MANIFEST
|
||||
|
||||
uploads/
|
||||
# PyInstaller
|
||||
# Usually these files are written by a python script from a template
|
||||
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
||||
|
||||
2
CLAUDE.md
Normal file
2
CLAUDE.md
Normal file
@ -0,0 +1,2 @@
|
||||
这个项目通过 installer/docker-compose.yml 以 Docker 方式部署,目前运行在 http://127.0.0.1:2008 。
|
||||
修改代码后如需让改动生效,需要重启 docker-compose 容器。
|
||||
@ -697,13 +697,32 @@ class BaseMinerUExtractor:
|
||||
cache_status_path = os.path.join(temp_dir, 'cache_status.json')
|
||||
|
||||
try:
|
||||
# First try to load existing status to preserve completed_indices
|
||||
if os.path.exists(cache_status_path):
|
||||
with open(cache_status_path, 'r', encoding='utf-8') as f:
|
||||
status = json.load(f)
|
||||
else:
|
||||
status = {
|
||||
'status': 'complete',
|
||||
'completed_pages': total_pages,
|
||||
'total_pages': total_pages,
|
||||
'completed_indices': [],
|
||||
'started_at': time.time(),
|
||||
'updated_at': time.time(),
|
||||
'completed_at': time.time()
|
||||
}
|
||||
|
||||
# Update status to complete
|
||||
status['status'] = 'complete'
|
||||
status['completed_pages'] = total_pages
|
||||
status['total_pages'] = total_pages
|
||||
status['completed_at'] = time.time()
|
||||
status['updated_at'] = time.time()
|
||||
|
||||
# Ensure completed_indices exists
|
||||
if 'completed_indices' not in status:
|
||||
status['completed_indices'] = []
|
||||
|
||||
with open(cache_status_path, 'w', encoding='utf-8') as f:
|
||||
json.dump(status, f, ensure_ascii=False, indent=2)
|
||||
|
||||
|
||||
@ -95,7 +95,7 @@ class MinerUConfig:
|
||||
|
||||
# Queue Processing Configuration
|
||||
queue_size: int = 50
|
||||
processing_timeout: int = 600
|
||||
processing_timeout: int = 7200
|
||||
num_parser_threads: int = 1
|
||||
num_refiner_threads: int = 1
|
||||
num_recognizer_threads: int = 1
|
||||
|
||||
@ -543,9 +543,6 @@ class MinerUImageProcessor:
|
||||
# Build context-aware prompt with language
|
||||
prompt = self._build_context_aware_prompt(context, language_code)
|
||||
|
||||
# Log the final prompt for debugging
|
||||
self.logger.info(f"mineru-image: Final prompt for {os.path.basename(image_filepath)}:\n{prompt[:1000]}...")
|
||||
|
||||
messages = [
|
||||
{'role': 'system', 'content': prompt},
|
||||
{'role': 'user', 'content': [
|
||||
@ -681,8 +678,59 @@ class MinerUImageProcessor:
|
||||
json_end = response_content.find('```', json_start)
|
||||
response_content = response_content[json_start:json_end].strip()
|
||||
|
||||
# Parse JSON
|
||||
# Parse JSON with proper handling of multiline strings
|
||||
import re
|
||||
import ast
|
||||
|
||||
try:
|
||||
# First try direct parsing (handles properly formatted JSON)
|
||||
result_json = json.loads(response_content)
|
||||
except json.JSONDecodeError as e:
|
||||
# If parsing fails, try to fix unescaped newlines in string values
|
||||
self.logger.debug(f"mineru-image: Initial JSON parse failed, attempting recovery: {str(e)}")
|
||||
|
||||
# Extract JSON from markdown if present
|
||||
if '```json' in response_content and '```' in response_content:
|
||||
json_start = response_content.find('```json') + 7
|
||||
json_end = response_content.find('```', json_start)
|
||||
json_str = response_content[json_start:json_end].strip()
|
||||
else:
|
||||
# Extract JSON between first { and last }
|
||||
start_idx = response_content.find('{')
|
||||
end_idx = response_content.rfind('}')
|
||||
if start_idx != -1 and end_idx != -1 and end_idx > start_idx:
|
||||
json_str = response_content[start_idx:end_idx + 1]
|
||||
else:
|
||||
json_str = response_content
|
||||
|
||||
# Method 1: Try using ast.literal_eval which is more forgiving
|
||||
try:
|
||||
# Replace single quotes with double quotes for JSON compatibility
|
||||
json_str_fixed = json_str.replace("'", '"')
|
||||
result_dict = ast.literal_eval(json_str_fixed)
|
||||
result_json = dict(result_dict) # Convert to regular dict
|
||||
except:
|
||||
# Method 2: Manually fix newlines in string values
|
||||
# Use regex to find string values and escape newlines within them
|
||||
def escape_newlines_in_strings(match):
    r"""Rebuild one matched JSON string literal so it parses under strict JSON.

    Intended as the ``repl`` callback for ``re.sub``: ``match.group(1)`` is
    the text found between a pair of double quotes.

    Escape sequences that are already valid JSON (``\"``, ``\\``, ``\/``,
    ``\b``, ``\f``, ``\n``, ``\r``, ``\t``, ``\uXXXX``) are kept untouched.
    Blindly doubling every backslash would turn an existing ``\t`` into a
    literal backslash followed by ``t`` and so change the decoded value.
    Only backslashes that do not start a valid escape are doubled, and raw
    newline / carriage-return / tab characters are replaced by their
    escaped forms.

    Args:
        match: Regex match whose group 1 is the string body without quotes.

    Returns:
        The string literal, re-quoted, with raw control characters escaped.
    """
    string_content = match.group(1)

    def _keep_or_double(esc):
        token = esc.group(0)
        # A multi-character token is a valid JSON escape: keep it verbatim.
        # A single backslash is stray (e.g. LaTeX "\alpha"): make it literal.
        return token if len(token) > 1 else '\\\\'

    # The first alternative consumes valid escapes as a unit, so the second
    # alternative only ever sees backslashes that are not part of one.
    string_content = re.sub(
        r'\\(?:["\\/bfnrt]|u[0-9a-fA-F]{4})|\\',
        _keep_or_double,
        string_content,
    )

    # Raw control characters are what strict JSON rejects inside strings.
    string_content = string_content.replace('\n', '\\n')  # Escape newlines
    string_content = string_content.replace('\r', '\\r')  # Escape carriage returns
    string_content = string_content.replace('\t', '\\t')  # Escape tabs
    return f'"{string_content}"'
|
||||
|
||||
# Pattern to match string values in JSON
|
||||
pattern = r'"([^"\\]*(?:\\.[^"\\]*)*)"'
|
||||
json_str = re.sub(pattern, escape_newlines_in_strings, json_str)
|
||||
|
||||
# Remove any remaining invalid control characters
|
||||
json_str = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x9f]', '', json_str)
|
||||
|
||||
# Try parsing the fixed JSON
|
||||
result_json = json.loads(json_str)
|
||||
|
||||
# Log the raw classification response for debugging
|
||||
self.logger.info(f"mineru-image: parsed JSON response: {result_json}")
|
||||
|
||||
@ -300,7 +300,7 @@ class MinerUAdapter:
|
||||
|
||||
thread = threading.Thread(target=run_async)
|
||||
thread.start()
|
||||
thread.join(timeout=300) # 5分钟超时
|
||||
thread.join(timeout=int(os.environ.get('MINERU_PROCESSING_TIMEOUT', 7200))) # 2h超时
|
||||
|
||||
if exception:
|
||||
raise exception
|
||||
|
||||
@ -592,7 +592,6 @@ class ParallelMinerUProcessor:
|
||||
img_filepath, img_filename, xref
|
||||
)
|
||||
images_to_process.append(image_info)
|
||||
self.logger.info(f"Recognizer: loaded image info: {image_info}")
|
||||
else:
|
||||
self.logger.warning(f"Recognizer: image file not found: {img_filepath}")
|
||||
|
||||
|
||||
@ -14,13 +14,13 @@ RUN cd ui && ls -la && if [ -d "dist" ]; then exit 0; fi && \
|
||||
FROM ghcr.io/1panel-dev/maxkb-base:python3.11-pg17.6 AS stage-build
|
||||
|
||||
# Configure apt to use Aliyun mirror (for Debian 12 bookworm)
|
||||
RUN if [ -f /etc/apt/sources.list.d/debian.sources ]; then \
|
||||
sed -i 's/deb.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list.d/debian.sources && \
|
||||
sed -i 's/security.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list.d/debian.sources; \
|
||||
else \
|
||||
sed -i 's/deb.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list && \
|
||||
sed -i 's/security.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list; \
|
||||
fi
|
||||
#RUN if [ -f /etc/apt/sources.list.d/debian.sources ]; then \
|
||||
# sed -i 's/deb.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list.d/debian.sources && \
|
||||
# sed -i 's/security.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list.d/debian.sources; \
|
||||
# else \
|
||||
# sed -i 's/deb.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list && \
|
||||
# sed -i 's/security.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list; \
|
||||
# fi
|
||||
|
||||
ARG DEPENDENCIES=" \
|
||||
python3-pip \
|
||||
|
||||
@ -1,10 +1,10 @@
|
||||
version: '3.8'
|
||||
version: "3.8"
|
||||
|
||||
services:
|
||||
maxkb-dev:
|
||||
image: 192.168.101.129:2009/maxkb-dev:v1
|
||||
image: maxkb-local
|
||||
# 如果需要本地构建,取消下面两行注释
|
||||
#build:
|
||||
# build:
|
||||
# context: ../
|
||||
# dockerfile: installer/Dockerfile
|
||||
container_name: maxkb-dev
|
||||
@ -13,6 +13,7 @@ services:
|
||||
volumes:
|
||||
# 持久化数据
|
||||
- ~/.maxkb:/opt/maxkb/
|
||||
- ../apps:/opt/maxkb-app/apps
|
||||
- /tmp/maxkb:/tmp
|
||||
environment:
|
||||
# 开启调试模式
|
||||
@ -26,12 +27,12 @@ services:
|
||||
# MinerU 图片存储路径
|
||||
MAXKB_STORAGE_PATH: "/opt/maxkb/storage"
|
||||
|
||||
#MINERU_API_TYPE: "self_hosted"
|
||||
#MINERU_API_URL: "http://mineru:8000"
|
||||
MINERU_API_TYPE: "self_hosted"
|
||||
MINERU_API_URL: "http://192.168.102.5:9987"
|
||||
|
||||
MINERU_API_TYPE: "cloud"
|
||||
MINERU_API_URL: "https://mineru.net"
|
||||
MINERU_API_KEY: "eyJ0eXBlIjoiSldUIiwiYWxnIjoiSFM1MTIifQ.eyJqdGkiOiI2OTYwMDEwNiIsInJvbCI6IlJPTEVfUkVHSVNURVIiLCJpc3MiOiJPcGVuWExhYiIsImlhdCI6MTc1NTE2MzQ5OCwiY2xpZW50SWQiOiJsa3pkeDU3bnZ5MjJqa3BxOXgydyIsInBob25lIjoiIiwib3BlbklkIjpudWxsLCJ1dWlkIjoiYTQwODk5NjMtNDI1OS00MWM3LWE3NWItY2IzZTQ4NTRjYWIwIiwiZW1haWwiOiIiLCJleHAiOjE3NTYzNzMwOTh9.88m9JSKQhkwJ557jCTFOgmdjeAfpXzxy2QDINaJ0rfHfnMNBxQt47aHr2jABeuxW-fXm8S5AO7zWWTXEGx8BxA"
|
||||
# MINERU_API_TYPE: "cloud"
|
||||
# MINERU_API_URL: "https://mineru.net"
|
||||
# MINERU_API_KEY: "eyJ0eXBlIjoiSldUIiwiYWxnIjoiSFM1MTIifQ.eyJqdGkiOiI2OTYwMDEwNiIsInJvbCI6IlJPTEVfUkVHSVNURVIiLCJpc3MiOiJPcGVuWExhYiIsImlhdCI6MTc1NTE2MzQ5OCwiY2xpZW50SWQiOiJsa3pkeDU3bnZ5MjJqa3BxOXgydyIsInBob25lIjoiIiwib3BlbklkIjpudWxsLCJ1dWlkIjoiYTQwODk5NjMtNDI1OS00MWM3LWE3NWItY2IzZTQ4NTRjYWIwIiwiZW1haWwiOiIiLCJleHAiOjE3NTYzNzMwOTh9.88m9JSKQhkwJ557jCTFOgmdjeAfpXzxy2QDINaJ0rfHfnMNBxQt47aHr2jABeuxW-fXm8S5AO7zWWTXEGx8BxA"
|
||||
|
||||
# 配置队列大小
|
||||
MINERU_QUEUE_SIZE: "50"
|
||||
@ -59,8 +60,6 @@ services:
|
||||
MINERU_FILTER_MEANINGLESS: "true"
|
||||
# 使用默认的启动命令,让容器内的所有服务正常启动
|
||||
# command 留空使用镜像默认的 entrypoint
|
||||
|
||||
|
||||
# 如果使用绑定挂载,可以删除这个 volumes 部分
|
||||
# volumes:
|
||||
# maxkb_data:
|
||||
|
||||
@ -49,8 +49,10 @@ dependencies = [
|
||||
"python-docx==1.2.0",
|
||||
"xlrd==2.0.2",
|
||||
"xlwt==1.3.0",
|
||||
"pymupdf==1.26.3",
|
||||
"pypdf==6.0.0",
|
||||
"python-pptx==1.0.2",
|
||||
"reportlab==4.2.5",
|
||||
"pymupdf==1.26.3",
|
||||
"pydub==0.25.1",
|
||||
"pillow==11.0.0",
|
||||
"pdf2image==1.17.0",
|
||||
|
||||
Loading…
Reference in New Issue
Block a user