Convert all Chinese comments, docstrings, logger/print output, HTTPException detail messages, and API response messages to English across the entire codebase. Functional zh/ja localized strings (e.g. prompt templates, timezone display names, date formats) are preserved as-is. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
500 lines
19 KiB
Python
500 lines
19 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Queue tasks for file processing integration.
|
|
"""
|
|
|
|
import os
|
|
import json
|
|
import time
|
|
import hashlib
|
|
import shutil
|
|
from typing import Dict, List, Optional, Any
|
|
|
|
from task_queue.config import huey
|
|
from task_queue.manager import queue_manager
|
|
from task_queue.task_status import task_status_store
|
|
from utils import download_dataset_files, save_processed_files_log, load_processed_files_log
|
|
from utils.dataset_manager import remove_dataset_directory_by_key
|
|
|
|
|
|
def scan_upload_folder(upload_dir: str) -> List[str]:
    """
    Recursively scan the upload folder for files in a supported format.

    A file is reported when its lower-cased extension is in the supported
    set, or when it has no extension at all and its first 1024 characters
    can be read as UTF-8 text. File names starting with '.' or '~' are
    always skipped.

    Args:
        upload_dir: Upload folder path

    Returns:
        List[str]: Paths of the supported files, in os.walk order. Empty
        when upload_dir does not exist.
    """
    supported_extensions = {
        # Text files
        '.txt', '.md', '.rtf',
        # Document files
        '.doc', '.docx', '.pdf', '.odt',
        # Spreadsheet files
        '.xls', '.xlsx', '.csv', '.ods',
        # Presentation files
        '.ppt', '.pptx', '.odp',
        # E-books
        '.epub', '.mobi',
        # Web files
        '.html', '.htm',
        # Config files
        '.json', '.xml', '.yaml', '.yml',
        # Code files
        '.py', '.js', '.java', '.cpp', '.c', '.go', '.rs',
        # Archive files
        '.zip', '.rar', '.7z', '.tar', '.gz',
    }

    scanned_files: List[str] = []

    if not os.path.exists(upload_dir):
        return scanned_files

    for root, _dirs, files in os.walk(upload_dir):
        for file in files:
            # Skip hidden files and system/temporary files
            if file.startswith(('.', '~')):
                continue

            file_path = os.path.join(root, file)
            file_extension = os.path.splitext(file)[1].lower()

            if file_extension in supported_extensions:
                scanned_files.append(file_path)
                continue

            # An unsupported, non-empty extension is never probed
            if file_extension:
                continue

            # Files without extension may still be text: probe the header.
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    f.read(1024)
            except (UnicodeDecodeError, OSError):
                # Not UTF-8 text, or unreadable for any OS-level reason
                # (permissions, dangling symlink, ...). One bad entry must
                # not abort the scan of the remaining files.
                continue

            scanned_files.append(file_path)

    return scanned_files
|
|
|
|
|
|
@huey.task()
def process_files_async(
    dataset_id: str,
    files: Optional[Dict[str, List[str]]] = None,
    upload_folder: Optional[Dict[str, str]] = None,
    task_id: Optional[str] = None
) -> Dict[str, Any]:
    """
    Asynchronously process file tasks - compatible with existing files/process API.

    When `files` is empty and `upload_folder` is given, each named folder
    under projects/uploads is scanned and the files found become the input,
    grouped by the upload_folder keys. The input is then passed to
    download_dataset_files, the project README is regenerated (best effort),
    and every document.txt under the project directory is collected.

    Args:
        dataset_id: Unique project ID
        files: Dictionary of file paths grouped by key
        upload_folder: Upload folder dictionary organized by group name, e.g. {'group1': 'my_project1', 'group2': 'my_project2'}
        task_id: Task ID (for status tracking)

    Returns:
        Processing result dictionary. On success "status" is "success";
        any exception is caught and reported as a dictionary with
        "status" set to "error" instead of being raised.
    """
    try:
        print(f"Starting async file processing task, project ID: {dataset_id}")

        # If task_id is provided, set initial status
        if task_id:
            task_status_store.set_status(
                task_id=task_id,
                unique_id=dataset_id,
                status="running"
            )

        # Ensure project directory exists
        project_dir = os.path.join("projects", "data", dataset_id)
        if not os.path.exists(project_dir):
            os.makedirs(project_dir, exist_ok=True)

        # Process files: use key-grouped format
        processed_files_by_key = {}

        # If upload_folder is provided, scan files in those folders
        if upload_folder and not files:
            scanned_files_by_group = {}
            total_scanned_files = 0

            for group_name, folder_name in upload_folder.items():
                # Security check: prevent path traversal attacks.
                # basename() alone is not enough: it returns ".." for ".."
                # or "a/..", "." for ".", and "" for names ending in a
                # separator, all of which would resolve to the uploads root
                # or its parent instead of a single upload folder.
                safe_folder_name = os.path.basename(folder_name)
                if safe_folder_name in ("", ".", ".."):
                    print(f"Skipping invalid upload folder name: {folder_name!r} (group: {group_name})")
                    continue

                upload_dir = os.path.join("projects", "uploads", safe_folder_name)

                if os.path.exists(upload_dir):
                    scanned_files = scan_upload_folder(upload_dir)
                    if scanned_files:
                        scanned_files_by_group[group_name] = scanned_files
                        total_scanned_files += len(scanned_files)
                        print(f"Scanned {len(scanned_files)} files from upload folder '{safe_folder_name}' (group: {group_name})")
                    else:
                        print(f"No supported files found in upload folder '{safe_folder_name}' (group: {group_name})")
                else:
                    print(f"Upload folder does not exist: {upload_dir} (group: {group_name})")

            if scanned_files_by_group:
                files = scanned_files_by_group
                print(f"Total scanned {total_scanned_files} files from {len(scanned_files_by_group)} groups")
            else:
                print("No supported files found in any upload folder")

        if files:
            # Use files from the request (grouped by key)
            # download_dataset_files is a coroutine function; this task is
            # a plain function, so drive it to completion on an event loop.
            import asyncio
            try:
                loop = asyncio.get_event_loop()
            except RuntimeError:
                # No event loop is set for the current thread: create one
                loop = asyncio.new_event_loop()
                asyncio.set_event_loop(loop)

            processed_files_by_key = loop.run_until_complete(download_dataset_files(dataset_id, files))
            total_files = sum(len(files_list) for files_list in processed_files_by_key.values())
            print(f"Async processed {total_files} dataset files across {len(processed_files_by_key)} keys, project ID: {dataset_id}")
        else:
            print(f"No files provided in request, project ID: {dataset_id}")

        # Collect all document.txt files in the project directory
        document_files = []
        for root, _dirs, files_list in os.walk(project_dir):
            for file in files_list:
                if file == "document.txt":
                    document_files.append(os.path.join(root, file))

        # Generate project README.md file
        try:
            from utils.project_manager import save_project_readme
            save_project_readme(dataset_id)
            print(f"README.md generated, project ID: {dataset_id}")
        except Exception as e:
            print(f"Failed to generate README.md, project ID: {dataset_id}, error: {str(e)}")
            # Does not affect main processing flow, continue

        # Build result file list
        result_files = []
        for key in processed_files_by_key:
            # Add corresponding dataset document.txt path
            document_path = os.path.join("projects", "data", dataset_id, "datasets", key, "document.txt")
            if os.path.exists(document_path):
                result_files.append(document_path)

        # Also add document.txt files that exist but are not in processed_files_by_key
        existing_document_paths = set(result_files)  # Avoid duplicates
        for doc_file in document_files:
            if doc_file not in existing_document_paths:
                result_files.append(doc_file)

        result = {
            "status": "success",
            "message": f"Successfully processed {len(result_files)} document files across {len(processed_files_by_key)} keys",
            "dataset_id": dataset_id,
            "processed_files": result_files,
            "processed_files_by_key": processed_files_by_key,
            "document_files": document_files,
            "total_files_processed": sum(len(files_list) for files_list in processed_files_by_key.values()),
            # NOTE: this is the completion timestamp (seconds since the
            # epoch), not an elapsed duration.
            "processing_time": time.time()
        }

        # Update task status to completed
        if task_id:
            task_status_store.update_status(
                task_id=task_id,
                status="completed",
                result=result
            )

        print(f"Async file processing task completed: {dataset_id}")
        return result

    except Exception as e:
        error_msg = f"Error during async file processing: {str(e)}"
        print(error_msg)

        # Update task status to error
        if task_id:
            task_status_store.update_status(
                task_id=task_id,
                status="failed",
                error=error_msg
            )

        return {
            "status": "error",
            "message": error_msg,
            "dataset_id": dataset_id,
            "error": str(e)
        }
|
|
|
|
|
|
@huey.task()
def process_files_incremental_async(
    dataset_id: str,
    files_to_add: Optional[Dict[str, List[str]]] = None,
    files_to_remove: Optional[Dict[str, List[str]]] = None,
    system_prompt: Optional[str] = None,
    mcp_settings: Optional[List[Dict]] = None,
    task_id: Optional[str] = None
) -> Dict[str, Any]:
    """
    Incremental file processing task - supports adding and removing files.

    Removals run first, then additions (via download_dataset_files with
    incremental_mode=True). Afterwards the processing log is saved, the
    optional system prompt / MCP settings are written into the project
    directory, the README is regenerated (best effort) and every
    document.txt under the project directory is collected.

    Args:
        dataset_id: Unique project ID
        files_to_add: Dictionary of file paths to add, grouped by key
        files_to_remove: Dictionary of file paths to remove, grouped by key.
            An empty list for a key removes the entire key group.
        system_prompt: System prompt
        mcp_settings: MCP settings
        task_id: Task ID (for status tracking)

    Returns:
        Processing result dictionary. On success "status" is "success";
        any exception is caught and reported as a dictionary with
        "status" set to "error" instead of being raised.
    """
    try:
        print(f"Starting incremental file processing task, project ID: {dataset_id}")

        # If task_id is provided, set initial status
        if task_id:
            task_status_store.set_status(
                task_id=task_id,
                unique_id=dataset_id,
                status="running"
            )

        # Ensure project directory exists
        project_dir = os.path.join("projects", "data", dataset_id)
        if not os.path.exists(project_dir):
            os.makedirs(project_dir, exist_ok=True)

        # Load existing processing log
        processed_log = load_processed_files_log(dataset_id)
        print(f"Loaded existing processing log with {len(processed_log)} file records")

        removed_files = []
        added_files = []

        # 1. Process removals
        if files_to_remove:
            print(f"Starting removal processing across {len(files_to_remove)} key groups")
            for key, file_list in files_to_remove.items():
                if not file_list:  # If file list is empty, remove the entire key group
                    print(f"Removing entire key group: {key}")
                    # NOTE(review): the log purge below is kept inside this
                    # success check; the nesting was reconstructed from a
                    # whitespace-stripped source, so confirm against the
                    # repository.
                    if remove_dataset_directory_by_key(dataset_id, key):
                        removed_files.append(f"dataset/{key}")

                        # Remove all records for this key from the processing log
                        keys_to_remove = [file_hash for file_hash, file_info in processed_log.items()
                                          if file_info.get('key') == key]
                        for file_hash in keys_to_remove:
                            del processed_log[file_hash]
                            removed_files.append(f"log_entry:{file_hash}")
                else:
                    # Remove specific files
                    for file_path in file_list:
                        print(f"Removing specific file: {key}/{file_path}")

                        # Actually delete the file
                        filename = os.path.basename(file_path)

                        if filename in ("", ".", ".."):
                            # Such a name would make the paths below point
                            # at the key's whole directory (or its parent),
                            # and rmtree would wipe far more than one file.
                            print(f"Skipping file deletion for invalid file name: {key}/{file_path}")
                        else:
                            # Delete original file
                            source_file = os.path.join("projects", "data", dataset_id, "files", key, filename)
                            if os.path.exists(source_file):
                                os.remove(source_file)
                                removed_files.append(f"file:{key}/{filename}")

                            # Delete processed file directory
                            processed_dir = os.path.join("projects", "data", dataset_id, "processed", key, filename)
                            if os.path.exists(processed_dir):
                                shutil.rmtree(processed_dir)
                                removed_files.append(f"processed:{key}/{filename}")

                        # Compute file hash to find in log (MD5 of the path
                        # string as given, not of the file content)
                        file_hash = hashlib.md5(file_path.encode('utf-8')).hexdigest()

                        # Remove from processing log
                        if file_hash in processed_log:
                            del processed_log[file_hash]
                            removed_files.append(f"log_entry:{file_hash}")

        # 2. Process additions
        processed_files_by_key = {}
        if files_to_add:
            print(f"Starting addition processing across {len(files_to_add)} key groups")
            # download_dataset_files is a coroutine function; this task is
            # a plain function, so drive it to completion on an event loop.
            import asyncio
            try:
                loop = asyncio.get_event_loop()
            except RuntimeError:
                # No event loop is set for the current thread: create one
                loop = asyncio.new_event_loop()
                asyncio.set_event_loop(loop)

            processed_files_by_key = loop.run_until_complete(download_dataset_files(dataset_id, files_to_add, incremental_mode=True))
            total_added_files = sum(len(files_list) for files_list in processed_files_by_key.values())
            print(f"Async processed {total_added_files} dataset files across {len(processed_files_by_key)} keys, project ID: {dataset_id}")

            # Record added files
            for key, files_list in processed_files_by_key.items():
                for file_path in files_list:
                    added_files.append(f"{key}/{file_path}")
        else:
            print(f"No files to add provided in request, project ID: {dataset_id}")

        # Save updated processing log
        save_processed_files_log(dataset_id, processed_log)
        print(f"Updated processing log, now contains {len(processed_log)} file records")

        # Save system_prompt and mcp_settings to project directory (if provided)
        if system_prompt:
            system_prompt_file = os.path.join(project_dir, "system_prompt.md")
            with open(system_prompt_file, 'w', encoding='utf-8') as f:
                f.write(system_prompt)
            print(f"Saved system_prompt, project ID: {dataset_id}")

        if mcp_settings:
            mcp_settings_file = os.path.join(project_dir, "mcp_settings.json")
            with open(mcp_settings_file, 'w', encoding='utf-8') as f:
                json.dump(mcp_settings, f, ensure_ascii=False, indent=2)
            print(f"Saved mcp_settings, project ID: {dataset_id}")

        # Generate project README.md file
        try:
            from utils.project_manager import save_project_readme
            save_project_readme(dataset_id)
            print(f"README.md generated, project ID: {dataset_id}")
        except Exception as e:
            print(f"Failed to generate README.md, project ID: {dataset_id}, error: {str(e)}")
            # Does not affect main processing flow, continue

        # Collect all document.txt files in the project directory
        document_files = []
        for root, _dirs, files_list in os.walk(project_dir):
            for file in files_list:
                if file == "document.txt":
                    document_files.append(os.path.join(root, file))

        # Build result file list
        result_files = []
        for key in processed_files_by_key:
            # Add corresponding dataset document.txt path
            document_path = os.path.join("projects", "data", dataset_id, "datasets", key, "document.txt")
            if os.path.exists(document_path):
                result_files.append(document_path)

        # Also add document.txt files that exist but are not in processed_files_by_key
        existing_document_paths = set(result_files)  # Avoid duplicates
        for doc_file in document_files:
            if doc_file not in existing_document_paths:
                result_files.append(doc_file)

        result = {
            "status": "success",
            "message": f"Incremental processing complete - added {len(added_files)} files, removed {len(removed_files)} files, {len(result_files)} document files remaining",
            "dataset_id": dataset_id,
            "removed_files": removed_files,
            "added_files": added_files,
            "processed_files": result_files,
            "processed_files_by_key": processed_files_by_key,
            "document_files": document_files,
            "total_files_added": sum(len(files_list) for files_list in processed_files_by_key.values()),
            # Counts every removal record, including "log_entry:" items
            "total_files_removed": len(removed_files),
            "final_files_count": len(result_files),
            # NOTE: this is the completion timestamp (seconds since the
            # epoch), not an elapsed duration.
            "processing_time": time.time()
        }

        # Update task status to completed
        if task_id:
            task_status_store.update_status(
                task_id=task_id,
                status="completed",
                result=result
            )

        print(f"Incremental file processing task completed: {dataset_id}")
        return result

    except Exception as e:
        error_msg = f"Error during incremental file processing: {str(e)}"
        print(error_msg)

        # Update task status to error
        if task_id:
            task_status_store.update_status(
                task_id=task_id,
                status="failed",
                error=error_msg
            )

        return {
            "status": "error",
            "message": error_msg,
            "dataset_id": dataset_id,
            "error": str(e)
        }
|
|
|
|
|
|
@huey.task()
def cleanup_project_async(
    dataset_id: str,
    remove_all: bool = False
) -> Dict[str, Any]:
    """
    Asynchronously clean up project files.

    With remove_all set and the project directory present, the whole
    directory is deleted. In every other case (including remove_all set
    but the directory missing) only the processed_files.json log inside
    the project directory is removed, if it exists.

    Args:
        dataset_id: Unique project ID. Must resolve to a location strictly
            inside projects/data; anything else is reported as an error.
        remove_all: Whether to remove the entire project directory

    Returns:
        Cleanup result dictionary. Any exception is caught and reported as
        a dictionary with "status" set to "error" instead of being raised.
    """
    try:
        print(f"Starting async project cleanup, project ID: {dataset_id}")

        project_dir = os.path.join("projects", "data", dataset_id)

        # Safety check before anything destructive: an empty ID, ".", ".."
        # or an absolute path would make project_dir resolve to (or above)
        # the shared data root, and rmtree would then delete every project.
        data_root = os.path.abspath(os.path.join("projects", "data"))
        resolved_dir = os.path.abspath(project_dir)
        if not resolved_dir.startswith(data_root + os.sep):
            raise ValueError(f"Invalid dataset_id: {dataset_id!r}")

        removed_items = []

        if remove_all and os.path.exists(project_dir):
            # shutil comes from the module-level import
            shutil.rmtree(project_dir)
            removed_items.append(project_dir)
            result = {
                "status": "success",
                "message": f"Deleted entire project directory: {project_dir}",
                "dataset_id": dataset_id,
                "removed_items": removed_items,
                "action": "remove_all"
            }
        else:
            # Only clean processing log
            log_file = os.path.join(project_dir, "processed_files.json")
            if os.path.exists(log_file):
                os.remove(log_file)
                removed_items.append(log_file)

            result = {
                "status": "success",
                "message": f"Cleaned project processing log, project ID: {dataset_id}",
                "dataset_id": dataset_id,
                "removed_items": removed_items,
                "action": "cleanup_logs"
            }

        print(f"Async cleanup task completed: {dataset_id}")
        return result

    except Exception as e:
        error_msg = f"Error during async project cleanup: {str(e)}"
        print(error_msg)
        return {
            "status": "error",
            "message": error_msg,
            "dataset_id": dataset_id,
            "error": str(e)
        }
|