#!/usr/bin/env python3
"""
Queue tasks for file processing integration.
"""

import asyncio
import hashlib
import json
import os
import shutil
import time
from typing import Any, Dict, List, Optional

from task_queue.config import huey
from task_queue.manager import queue_manager
from task_queue.task_status import task_status_store
from utils import (
    download_dataset_files,
    save_processed_files_log,
    load_processed_files_log,
)
from utils.dataset_manager import remove_dataset_directory_by_key


# File extensions that scan_upload_folder() accepts without inspecting content.
_SUPPORTED_EXTENSIONS = frozenset({
    # Text files
    '.txt', '.md', '.rtf',
    # Document files
    '.doc', '.docx', '.pdf', '.odt',
    # Spreadsheet files
    '.xls', '.xlsx', '.csv', '.ods',
    # Presentation files
    '.ppt', '.pptx', '.odp',
    # E-books
    '.epub', '.mobi',
    # Web files
    '.html', '.htm',
    # Config files
    '.json', '.xml', '.yaml', '.yml',
    # Code files
    '.py', '.js', '.java', '.cpp', '.c', '.go', '.rs',
    # Archive files
    '.zip', '.rar', '.7z', '.tar', '.gz',
})

# Names that os.path.basename() can return but that do not identify a single
# entry inside the parent directory; joining them onto a base path would
# address the base directory itself or its parent.
_UNSAFE_PATH_COMPONENTS = frozenset({'', '.', '..'})


def _is_safe_component(name: str) -> bool:
    """Return True if *name* can be joined to a directory as a single child entry."""
    return name not in _UNSAFE_PATH_COMPONENTS


def _looks_like_text(file_path: str) -> bool:
    """Return True if the start of *file_path* can be read and decoded as UTF-8."""
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            f.read(1024)  # Decoding up to 1024 characters is enough to decide
    except (UnicodeDecodeError, OSError):
        # Not UTF-8 text, or not readable (permissions, broken symlink, ...)
        return False
    return True


def _get_event_loop() -> asyncio.AbstractEventLoop:
    """Return a usable event loop for the current thread, creating one if needed."""
    try:
        loop = asyncio.get_event_loop()
    except RuntimeError:
        # No event loop is set for this thread
        loop = None
    if loop is None or loop.is_closed():
        # A closed loop cannot run anything; replace it with a fresh one
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
    return loop


def _generate_readme(dataset_id: str) -> None:
    """Generate the project README.md; failures are reported but never raised."""
    try:
        # Imported lazily so that an import failure is handled like any
        # other README generation failure
        from utils.project_manager import save_project_readme
        save_project_readme(dataset_id)
        print(f"README.md generated, project ID: {dataset_id}")
    except Exception as e:
        # Does not affect main processing flow, continue
        print(f"Failed to generate README.md, project ID: {dataset_id}, error: {str(e)}")


def _collect_document_files(project_dir: str) -> List[str]:
    """Return the paths of all files named document.txt below *project_dir*."""
    return [
        os.path.join(root, file)
        for root, _dirs, files_list in os.walk(project_dir)
        for file in files_list
        if file == "document.txt"
    ]


def _build_result_files(
    dataset_id: str,
    processed_files_by_key: Dict[str, List[str]],
    document_files: List[str]
) -> List[str]:
    """Return document.txt paths of the processed keys first, then all other found ones."""
    result_files = []
    for key in processed_files_by_key:
        # Add corresponding dataset document.txt path
        document_path = os.path.join("projects", "data", dataset_id, "datasets", key, "document.txt")
        if os.path.exists(document_path):
            result_files.append(document_path)

    # Also add document.txt files that exist but are not in processed_files_by_key
    existing_document_paths = set(result_files)  # Avoid duplicates
    result_files.extend(
        doc_file for doc_file in document_files
        if doc_file not in existing_document_paths
    )
    return result_files


def _scan_upload_groups(upload_folder: Dict[str, str]) -> Dict[str, List[str]]:
    """Scan each group's folder under projects/uploads and return the non-empty results."""
    scanned_files_by_group = {}
    for group_name, folder_name in upload_folder.items():
        # Security check: prevent path traversal attacks
        safe_folder_name = os.path.basename(folder_name)
        if not _is_safe_component(safe_folder_name):
            # basename() yields '' for a trailing slash and '..' for '..';
            # joining those would scan the whole uploads tree or its parent
            print(f"Invalid upload folder name: {folder_name!r} (group: {group_name})")
            continue

        upload_dir = os.path.join("projects", "uploads", safe_folder_name)
        if not os.path.exists(upload_dir):
            print(f"Upload folder does not exist: {upload_dir} (group: {group_name})")
            continue

        scanned_files = scan_upload_folder(upload_dir)
        if scanned_files:
            scanned_files_by_group[group_name] = scanned_files
            print(f"Scanned {len(scanned_files)} files from upload folder '{safe_folder_name}' (group: {group_name})")
        else:
            print(f"No supported files found in upload folder '{safe_folder_name}' (group: {group_name})")
    return scanned_files_by_group


def _remove_key_group(dataset_id: str, key: str, processed_log: Dict[str, Any]) -> List[str]:
    """Remove a whole key group and its log records; return labels of what was removed.

    *processed_log* is modified in place.
    """
    removed = []
    print(f"Removing entire key group: {key}")
    if remove_dataset_directory_by_key(dataset_id, key):
        removed.append(f"dataset/{key}")

    # Remove all records for this key from the processing log
    keys_to_remove = [
        file_hash for file_hash, file_info in processed_log.items()
        if file_info.get('key') == key
    ]
    for file_hash in keys_to_remove:
        del processed_log[file_hash]
        removed.append(f"log_entry:{file_hash}")
    return removed


def _remove_single_file(
    dataset_id: str,
    key: str,
    file_path: str,
    processed_log: Dict[str, Any]
) -> List[str]:
    """Remove one file, its processed directory and its log record.

    *processed_log* is modified in place. Returns labels of what was removed.
    """
    removed = []
    print(f"Removing specific file: {key}/{file_path}")

    # NOTE(review): key is joined into the paths below without validation;
    # confirm that callers only pass trusted key names.
    filename = os.path.basename(file_path)
    if _is_safe_component(filename):
        # Delete original file
        source_file = os.path.join("projects", "data", dataset_id, "files", key, filename)
        if os.path.exists(source_file):
            os.remove(source_file)
            removed.append(f"file:{key}/{filename}")

        # Delete processed file directory
        processed_dir = os.path.join("projects", "data", dataset_id, "processed", key, filename)
        if os.path.exists(processed_dir):
            shutil.rmtree(processed_dir)
            removed.append(f"processed:{key}/{filename}")
    else:
        # An empty, '.' or '..' file name would make the paths above point at
        # the key directory or its parent, and rmtree would wipe all of it
        print(f"Skipping file deletion, invalid file name: {key}/{file_path}")

    # Compute file hash to find in log
    file_hash = hashlib.md5(file_path.encode('utf-8')).hexdigest()

    # Remove from processing log
    if file_hash in processed_log:
        del processed_log[file_hash]
        removed.append(f"log_entry:{file_hash}")
    return removed


def scan_upload_folder(upload_dir: str) -> List[str]:
    """
    Scan all supported file formats in the upload folder.

    Files whose name starts with '.' or '~' are skipped. Files with a
    supported extension are accepted directly; files without any extension
    are accepted only if their beginning can be read as UTF-8 text.

    Args:
        upload_dir: Upload folder path

    Returns:
        List[str]: List of supported file paths (empty if the folder does not exist)
    """
    scanned_files: List[str] = []

    if not os.path.exists(upload_dir):
        return scanned_files

    for root, _dirs, files in os.walk(upload_dir):
        for file in files:
            # Skip hidden files and system files
            if file.startswith(('.', '~')):
                continue

            file_path = os.path.join(root, file)
            file_extension = os.path.splitext(file)[1].lower()

            if file_extension in _SUPPORTED_EXTENSIONS:
                scanned_files.append(file_path)
            elif not file_extension and _looks_like_text(file_path):
                # Files without extension are processed if they look like text
                scanned_files.append(file_path)

    return scanned_files


@huey.task()
def process_files_async(
    dataset_id: str,
    files: Optional[Dict[str, List[str]]] = None,
    upload_folder: Optional[Dict[str, str]] = None,
    task_id: Optional[str] = None
) -> Dict[str, Any]:
    """
    Asynchronously process file tasks - compatible with existing files/process API.

    Any exception raised during processing is caught, reported through the
    task status store (when task_id is given) and returned as an error result.

    Args:
        dataset_id: Unique project ID
        files: Dictionary of file paths grouped by key
        upload_folder: Upload folder dictionary organized by group name,
            e.g. {'group1': 'my_project1', 'group2': 'my_project2'}.
            Only used when files is empty.
        task_id: Task ID (for status tracking)

    Returns:
        Processing result dictionary. On success, "processing_time" holds the
        value of time.time() at completion (a timestamp, not a duration).
    """
    try:
        print(f"Starting async file processing task, project ID: {dataset_id}")

        # If task_id is provided, set initial status
        if task_id:
            task_status_store.set_status(
                task_id=task_id,
                unique_id=dataset_id,
                status="running"
            )

        # Ensure project directory exists
        project_dir = os.path.join("projects", "data", dataset_id)
        if not os.path.exists(project_dir):
            os.makedirs(project_dir, exist_ok=True)

        # Process files: use key-grouped format
        processed_files_by_key = {}

        # If upload_folder is provided, scan files in those folders
        if upload_folder and not files:
            scanned_files_by_group = _scan_upload_groups(upload_folder)
            if scanned_files_by_group:
                total_scanned_files = sum(len(group) for group in scanned_files_by_group.values())
                files = scanned_files_by_group
                print(f"Total scanned {total_scanned_files} files from {len(scanned_files_by_group)} groups")
            else:
                print("No supported files found in any upload folder")

        if files:
            # Use files from the request (grouped by key).
            # The download helper is a coroutine; drive it to completion here.
            loop = _get_event_loop()
            processed_files_by_key = loop.run_until_complete(download_dataset_files(dataset_id, files))
            total_files = sum(len(files_list) for files_list in processed_files_by_key.values())
            print(f"Async processed {total_files} dataset files across {len(processed_files_by_key)} keys, project ID: {dataset_id}")
        else:
            print(f"No files provided in request, project ID: {dataset_id}")

        # Collect all document.txt files in the project directory
        document_files = _collect_document_files(project_dir)

        # Generate project README.md file
        _generate_readme(dataset_id)

        # Build result file list
        result_files = _build_result_files(dataset_id, processed_files_by_key, document_files)

        result = {
            "status": "success",
            "message": f"Successfully processed {len(result_files)} document files across {len(processed_files_by_key)} keys",
            "dataset_id": dataset_id,
            "processed_files": result_files,
            "processed_files_by_key": processed_files_by_key,
            "document_files": document_files,
            "total_files_processed": sum(len(files_list) for files_list in processed_files_by_key.values()),
            "processing_time": time.time()
        }

        # Update task status to completed
        if task_id:
            task_status_store.update_status(
                task_id=task_id,
                status="completed",
                result=result
            )

        print(f"Async file processing task completed: {dataset_id}")
        return result

    except Exception as e:
        error_msg = f"Error during async file processing: {str(e)}"
        print(error_msg)

        # Update task status to error
        if task_id:
            task_status_store.update_status(
                task_id=task_id,
                status="failed",
                error=error_msg
            )

        return {
            "status": "error",
            "message": error_msg,
            "dataset_id": dataset_id,
            "error": str(e)
        }


@huey.task()
def process_files_incremental_async(
    dataset_id: str,
    files_to_add: Optional[Dict[str, List[str]]] = None,
    files_to_remove: Optional[Dict[str, List[str]]] = None,
    system_prompt: Optional[str] = None,
    mcp_settings: Optional[List[Dict]] = None,
    task_id: Optional[str] = None
) -> Dict[str, Any]:
    """
    Incremental file processing task - supports adding and removing files.

    Removals are processed before additions. Any exception raised during
    processing is caught, reported through the task status store (when
    task_id is given) and returned as an error result.

    Args:
        dataset_id: Unique project ID
        files_to_add: Dictionary of file paths to add, grouped by key
        files_to_remove: Dictionary of file paths to remove, grouped by key.
            An empty list for a key removes the entire key group.
        system_prompt: System prompt, written to system_prompt.md if provided
        mcp_settings: MCP settings, written to mcp_settings.json if provided
        task_id: Task ID (for status tracking)

    Returns:
        Processing result dictionary. On success, "processing_time" holds the
        value of time.time() at completion (a timestamp, not a duration).
    """
    try:
        print(f"Starting incremental file processing task, project ID: {dataset_id}")

        # If task_id is provided, set initial status
        if task_id:
            task_status_store.set_status(
                task_id=task_id,
                unique_id=dataset_id,
                status="running"
            )

        # Ensure project directory exists
        project_dir = os.path.join("projects", "data", dataset_id)
        if not os.path.exists(project_dir):
            os.makedirs(project_dir, exist_ok=True)

        # Load existing processing log
        processed_log = load_processed_files_log(dataset_id)
        print(f"Loaded existing processing log with {len(processed_log)} file records")

        removed_files = []
        added_files = []

        # 1. Process removals
        if files_to_remove:
            print(f"Starting removal processing across {len(files_to_remove)} key groups")
            for key, file_list in files_to_remove.items():
                if not file_list:
                    # If file list is empty, remove the entire key group
                    removed_files.extend(_remove_key_group(dataset_id, key, processed_log))
                else:
                    # Remove specific files
                    for file_path in file_list:
                        removed_files.extend(
                            _remove_single_file(dataset_id, key, file_path, processed_log)
                        )

        # 2. Process additions
        processed_files_by_key = {}
        if files_to_add:
            print(f"Starting addition processing across {len(files_to_add)} key groups")

            # The download helper is a coroutine; drive it to completion here
            loop = _get_event_loop()
            processed_files_by_key = loop.run_until_complete(
                download_dataset_files(dataset_id, files_to_add, incremental_mode=True)
            )
            total_added_files = sum(len(files_list) for files_list in processed_files_by_key.values())
            print(f"Async processed {total_added_files} dataset files across {len(processed_files_by_key)} keys, project ID: {dataset_id}")

            # Record added files
            for key, files_list in processed_files_by_key.items():
                added_files.extend(f"{key}/{file_path}" for file_path in files_list)
        else:
            print(f"No files to add provided in request, project ID: {dataset_id}")

        # Save updated processing log
        save_processed_files_log(dataset_id, processed_log)
        print(f"Updated processing log, now contains {len(processed_log)} file records")

        # Save system_prompt and mcp_settings to project directory (if provided)
        if system_prompt:
            system_prompt_file = os.path.join(project_dir, "system_prompt.md")
            with open(system_prompt_file, 'w', encoding='utf-8') as f:
                f.write(system_prompt)
            print(f"Saved system_prompt, project ID: {dataset_id}")

        if mcp_settings:
            mcp_settings_file = os.path.join(project_dir, "mcp_settings.json")
            with open(mcp_settings_file, 'w', encoding='utf-8') as f:
                json.dump(mcp_settings, f, ensure_ascii=False, indent=2)
            print(f"Saved mcp_settings, project ID: {dataset_id}")

        # Generate project README.md file
        _generate_readme(dataset_id)

        # Collect all document.txt files in the project directory
        document_files = _collect_document_files(project_dir)

        # Build result file list
        result_files = _build_result_files(dataset_id, processed_files_by_key, document_files)

        result = {
            "status": "success",
            "message": f"Incremental processing complete - added {len(added_files)} files, removed {len(removed_files)} files, {len(result_files)} document files remaining",
            "dataset_id": dataset_id,
            "removed_files": removed_files,
            "added_files": added_files,
            "processed_files": result_files,
            "processed_files_by_key": processed_files_by_key,
            "document_files": document_files,
            "total_files_added": sum(len(files_list) for files_list in processed_files_by_key.values()),
            "total_files_removed": len(removed_files),
            "final_files_count": len(result_files),
            "processing_time": time.time()
        }

        # Update task status to completed
        if task_id:
            task_status_store.update_status(
                task_id=task_id,
                status="completed",
                result=result
            )

        print(f"Incremental file processing task completed: {dataset_id}")
        return result

    except Exception as e:
        error_msg = f"Error during incremental file processing: {str(e)}"
        print(error_msg)

        # Update task status to error
        if task_id:
            task_status_store.update_status(
                task_id=task_id,
                status="failed",
                error=error_msg
            )

        return {
            "status": "error",
            "message": error_msg,
            "dataset_id": dataset_id,
            "error": str(e)
        }


@huey.task()
def cleanup_project_async(
    dataset_id: str,
    remove_all: bool = False
) -> Dict[str, Any]:
    """
    Asynchronously clean up project files.

    Args:
        dataset_id: Unique project ID
        remove_all: Whether to remove the entire project directory.
            If the directory does not exist, the call falls through to the
            log-cleanup branch and reports the "cleanup_logs" action.

    Returns:
        Cleanup result dictionary
    """
    try:
        print(f"Starting async project cleanup, project ID: {dataset_id}")

        project_dir = os.path.join("projects", "data", dataset_id)
        removed_items = []

        if remove_all and os.path.exists(project_dir):
            shutil.rmtree(project_dir)
            removed_items.append(project_dir)
            result = {
                "status": "success",
                "message": f"Deleted entire project directory: {project_dir}",
                "dataset_id": dataset_id,
                "removed_items": removed_items,
                "action": "remove_all"
            }
        else:
            # Only clean processing log
            log_file = os.path.join(project_dir, "processed_files.json")
            if os.path.exists(log_file):
                os.remove(log_file)
                removed_items.append(log_file)

            result = {
                "status": "success",
                "message": f"Cleaned project processing log, project ID: {dataset_id}",
                "dataset_id": dataset_id,
                "removed_items": removed_items,
                "action": "cleanup_logs"
            }

        print(f"Async cleanup task completed: {dataset_id}")
        return result

    except Exception as e:
        error_msg = f"Error during async project cleanup: {str(e)}"
        print(error_msg)
        return {
            "status": "error",
            "message": error_msg,
            "dataset_id": dataset_id,
            "error": str(e)
        }