diff --git a/.gitignore b/.gitignore index 8d6edcb8..35358b5b 100644 --- a/.gitignore +++ b/.gitignore @@ -40,7 +40,7 @@ share/python-wheels/ .installed.cfg *.egg MANIFEST - +uploads/ # PyInstaller # Usually these files are written by a python script forms a template # before PyInstaller builds the exe, so as to inject date/other infos into it. diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 00000000..5d78a79b --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,2 @@ +这个项目基于docker部署的,目前部署在 http://127.0.0.1:2008, 通过 installer/docker-compose.yml 部署的。 +如果修改代码之后需要重启,就需要重启 docker-compose 容器。 \ No newline at end of file diff --git a/apps/common/handle/impl/mineru/base_parser.py b/apps/common/handle/impl/mineru/base_parser.py index cc899061..cb31cee9 100644 --- a/apps/common/handle/impl/mineru/base_parser.py +++ b/apps/common/handle/impl/mineru/base_parser.py @@ -695,15 +695,34 @@ class BaseMinerUExtractor: def _mark_cache_complete(self, temp_dir: str, total_pages: int): """标记缓存完成 - 公共方法""" cache_status_path = os.path.join(temp_dir, 'cache_status.json') - + try: - status = { - 'status': 'complete', - 'completed_pages': total_pages, - 'total_pages': total_pages, - 'completed_at': time.time() - } - + # First try to load existing status to preserve completed_indices + if os.path.exists(cache_status_path): + with open(cache_status_path, 'r', encoding='utf-8') as f: + status = json.load(f) + else: + status = { + 'status': 'complete', + 'completed_pages': total_pages, + 'total_pages': total_pages, + 'completed_indices': [], + 'started_at': time.time(), + 'updated_at': time.time(), + 'completed_at': time.time() + } + + # Update status to complete + status['status'] = 'complete' + status['completed_pages'] = total_pages + status['total_pages'] = total_pages + status['completed_at'] = time.time() + status['updated_at'] = time.time() + + # Ensure completed_indices exists + if 'completed_indices' not in status: + status['completed_indices'] = [] + with open(cache_status_path, 'w', 
encoding='utf-8') as f: json.dump(status, f, ensure_ascii=False, indent=2) diff --git a/apps/common/handle/impl/mineru/config_base.py b/apps/common/handle/impl/mineru/config_base.py index a7d1d8af..b5388076 100644 --- a/apps/common/handle/impl/mineru/config_base.py +++ b/apps/common/handle/impl/mineru/config_base.py @@ -95,7 +95,7 @@ class MinerUConfig: # Queue Processing Configuration queue_size: int = 50 - processing_timeout: int = 600 + processing_timeout: int = 7200 num_parser_threads: int = 1 num_refiner_threads: int = 1 num_recognizer_threads: int = 1 @@ -255,4 +255,4 @@ class MinerUConfig: **call_kwargs ) - return response \ No newline at end of file + return response diff --git a/apps/common/handle/impl/mineru/image_processor.py b/apps/common/handle/impl/mineru/image_processor.py index 4241afcf..e6d75bda 100644 --- a/apps/common/handle/impl/mineru/image_processor.py +++ b/apps/common/handle/impl/mineru/image_processor.py @@ -543,9 +543,6 @@ class MinerUImageProcessor: # Build context-aware prompt with language prompt = self._build_context_aware_prompt(context, language_code) - # Log the final prompt for debugging - self.logger.info(f"mineru-image: Final prompt for {os.path.basename(image_filepath)}:\n{prompt[:1000]}...") - messages = [ {'role': 'system', 'content': prompt}, {'role': 'user', 'content': [ @@ -680,9 +677,60 @@ class MinerUImageProcessor: json_start = response_content.find('```json') + 7 json_end = response_content.find('```', json_start) response_content = response_content[json_start:json_end].strip() - - # Parse JSON - result_json = json.loads(response_content) + + # Parse JSON with proper handling of multiline strings + import re + import ast + + try: + # First try direct parsing (handles properly formatted JSON) + result_json = json.loads(response_content) + except json.JSONDecodeError as e: + # If parsing fails, try to fix unescaped newlines in string values + self.logger.debug(f"mineru-image: Initial JSON parse failed, attempting 
recovery: {str(e)}") + + # Extract JSON from markdown if present + if '```json' in response_content and '```' in response_content: + json_start = response_content.find('```json') + 7 + json_end = response_content.find('```', json_start) + json_str = response_content[json_start:json_end].strip() + else: + # Extract JSON between first { and last } + start_idx = response_content.find('{') + end_idx = response_content.rfind('}') + if start_idx != -1 and end_idx != -1 and end_idx > start_idx: + json_str = response_content[start_idx:end_idx + 1] + else: + json_str = response_content + + # Method 1: Try using ast.literal_eval which is more forgiving + try: + # Replace single quotes with double quotes for JSON compatibility + json_str_fixed = json_str.replace("'", '"') + result_dict = ast.literal_eval(json_str_fixed) + result_json = dict(result_dict) # Convert to regular dict + except: + # Method 2: Manually fix newlines in string values + # Use regex to find string values and escape newlines within them + def escape_newlines_in_strings(match): + string_content = match.group(1) + # Escape newlines and other special chars within the string + string_content = string_content.replace('\\', '\\\\') # Escape backslashes first + string_content = string_content.replace('"', '\\"') # Escape quotes + string_content = string_content.replace('\n', '\\n') # Escape newlines + string_content = string_content.replace('\r', '\\r') # Escape carriage returns + string_content = string_content.replace('\t', '\\t') # Escape tabs + return f'"{string_content}"' + + # Pattern to match string values in JSON + pattern = r'"([^"\\]*(?:\\.[^"\\]*)*)"' + json_str = re.sub(pattern, escape_newlines_in_strings, json_str) + + # Remove any remaining invalid control characters + json_str = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x9f]', '', json_str) + + # Try parsing the fixed JSON + result_json = json.loads(json_str) # Log the raw classification response for debugging self.logger.info(f"mineru-image: 
parsed JSON response: {result_json}") @@ -1080,4 +1128,4 @@ class MinerUImageProcessor: image_refs[placeholder] = img_markdown - return image_refs \ No newline at end of file + return image_refs diff --git a/apps/common/handle/impl/mineru/maxkb_adapter/adapter.py b/apps/common/handle/impl/mineru/maxkb_adapter/adapter.py index 40e52fa7..36e8096e 100644 --- a/apps/common/handle/impl/mineru/maxkb_adapter/adapter.py +++ b/apps/common/handle/impl/mineru/maxkb_adapter/adapter.py @@ -300,7 +300,7 @@ class MinerUAdapter: thread = threading.Thread(target=run_async) thread.start() - thread.join(timeout=300) # 5分钟超时 + thread.join(timeout=int(os.environ.get('MINERU_PROCESSING_TIMEOUT', 7200))) # 2h超时 if exception: raise exception diff --git a/apps/common/handle/impl/mineru/parallel_processor.py b/apps/common/handle/impl/mineru/parallel_processor.py index 425de7e2..cbce9399 100644 --- a/apps/common/handle/impl/mineru/parallel_processor.py +++ b/apps/common/handle/impl/mineru/parallel_processor.py @@ -592,7 +592,6 @@ class ParallelMinerUProcessor: img_filepath, img_filename, xref ) images_to_process.append(image_info) - self.logger.info(f"Recognizer: loaded image info: {image_info}") else: self.logger.warning(f"Recognizer: image file not found: {img_filepath}") @@ -1521,4 +1520,4 @@ class ParallelMinerUProcessor: # Import required modules -import os \ No newline at end of file +import os diff --git a/installer/Dockerfile b/installer/Dockerfile index eb866118..150dbb85 100644 --- a/installer/Dockerfile +++ b/installer/Dockerfile @@ -14,13 +14,13 @@ RUN cd ui && ls -la && if [ -d "dist" ]; then exit 0; fi && \ FROM ghcr.io/1panel-dev/maxkb-base:python3.11-pg17.6 AS stage-build # Configure apt to use Aliyun mirror (for Debian 12 bookworm) -RUN if [ -f /etc/apt/sources.list.d/debian.sources ]; then \ - sed -i 's/deb.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list.d/debian.sources && \ - sed -i 's/security.debian.org/mirrors.aliyun.com/g' 
/etc/apt/sources.list.d/debian.sources; \ - else \ - sed -i 's/deb.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list && \ - sed -i 's/security.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list; \ - fi +#RUN if [ -f /etc/apt/sources.list.d/debian.sources ]; then \ +# sed -i 's/deb.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list.d/debian.sources && \ +# sed -i 's/security.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list.d/debian.sources; \ +# else \ +# sed -i 's/deb.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list && \ +# sed -i 's/security.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list; \ +# fi ARG DEPENDENCIES=" \ python3-pip \ diff --git a/installer/docker-compose.yml b/installer/docker-compose.yml index 7829130f..1941dd05 100644 --- a/installer/docker-compose.yml +++ b/installer/docker-compose.yml @@ -1,38 +1,39 @@ -version: '3.8' +version: "3.8" services: maxkb-dev: - image: 192.168.101.129:2009/maxkb-dev:v1 + image: maxkb-local # 如果需要本地构建,取消下面两行注释 - #build: - # context: ../ - # dockerfile: installer/Dockerfile + # build: + # context: ../ + # dockerfile: installer/Dockerfile container_name: maxkb-dev ports: - "2008:8080" volumes: # 持久化数据 - ~/.maxkb:/opt/maxkb/ + - ../apps:/opt/maxkb-app/apps - /tmp/maxkb:/tmp environment: # 开启调试模式 PYTHONUNBUFFERED: "1" MAXKB_LOG_LEVEL: "DEBUG" - + # MinerU 配置 MAXKB_BASE_URL: http://xbase.aitravelmaster.com MINERU_PARSER_CACHE: "True" MINERU_MULTIMODAL_REFINEMENT: "True" # MinerU 图片存储路径 MAXKB_STORAGE_PATH: "/opt/maxkb/storage" - - #MINERU_API_TYPE: "self_hosted" - #MINERU_API_URL: "http://mineru:8000" - - MINERU_API_TYPE: "cloud" - MINERU_API_URL: "https://mineru.net" - MINERU_API_KEY: 
"eyJ0eXBlIjoiSldUIiwiYWxnIjoiSFM1MTIifQ.eyJqdGkiOiI2OTYwMDEwNiIsInJvbCI6IlJPTEVfUkVHSVNURVIiLCJpc3MiOiJPcGVuWExhYiIsImlhdCI6MTc1NTE2MzQ5OCwiY2xpZW50SWQiOiJsa3pkeDU3bnZ5MjJqa3BxOXgydyIsInBob25lIjoiIiwib3BlbklkIjpudWxsLCJ1dWlkIjoiYTQwODk5NjMtNDI1OS00MWM3LWE3NWItY2IzZTQ4NTRjYWIwIiwiZW1haWwiOiIiLCJleHAiOjE3NTYzNzMwOTh9.88m9JSKQhkwJ557jCTFOgmdjeAfpXzxy2QDINaJ0rfHfnMNBxQt47aHr2jABeuxW-fXm8S5AO7zWWTXEGx8BxA" - + MINERU_API_TYPE: "self_hosted" + MINERU_API_URL: "http://192.168.102.5:9987" + + # MINERU_API_TYPE: "cloud" + # MINERU_API_URL: "https://mineru.net" + # MINERU_API_KEY: "<your-mineru-api-key>" + # 配置队列大小 MINERU_QUEUE_SIZE: "50" # 配置处理超时时间(秒) @@ -42,7 +43,7 @@ services: MINERU_REFINER_THREADS: "3" MINERU_RECOGNIZER_THREADS: "3" MINERU_UPLOADER_THREADS: "1" - + MINERU_BATCH_PROCESSING: "true" MINERU_BATCH_SIZE: "10" # 启用/禁用过滤 @@ -59,8 +60,6 @@ services: MINERU_FILTER_MEANINGLESS: "true" # 使用默认的启动命令,让容器内的所有服务正常启动 # command 留空使用镜像默认的 entrypoint - - # 如果使用绑定挂载,可以删除这个 volumes 部分 # volumes: # maxkb_data: diff --git a/pyproject.toml b/pyproject.toml index b2d0cbf9..ad6610d6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -49,8 +49,10 @@ dependencies = [ "python-docx==1.2.0", "xlrd==2.0.2", "xlwt==1.3.0", - "pymupdf==1.26.3", "pypdf==6.0.0", + "python-pptx==1.0.2", + "reportlab==4.2.5", + "pymupdf==1.26.3", "pydub==0.25.1", "pillow==11.0.0", "pdf2image==1.17.0",