# qwen_agent/task_queue/integration_tasks.py

#!/usr/bin/env python3
"""
Queue tasks for file processing integration.
"""

import os
import json
import time
import hashlib
import shutil
from typing import Dict, List, Optional, Any

from task_queue.config import huey
from task_queue.manager import queue_manager
from task_queue.task_status import task_status_store
from utils import download_dataset_files, save_processed_files_log, load_processed_files_log
from utils.dataset_manager import remove_dataset_directory_by_key


def scan_upload_folder(upload_dir: str) -> List[str]:
    """
    Recursively scan an upload folder for files that can be processed.

    A file is selected when its lower-cased extension is in the supported
    set, or when it has no extension at all and its beginning can be read
    as UTF-8 text. Names starting with '.' or '~' are always skipped.

    Args:
        upload_dir: Upload folder path

    Returns:
        List[str]: Paths of the selected files, in the order os.walk yields
        them. Empty when upload_dir does not exist.
    """
    supported_extensions = {
        # Text files
        '.txt', '.md', '.rtf',
        # Document files
        '.doc', '.docx', '.pdf', '.odt',
        # Spreadsheet files
        '.xls', '.xlsx', '.csv', '.ods',
        # Presentation files
        '.ppt', '.pptx', '.odp',
        # E-books
        '.epub', '.mobi',
        # Web files
        '.html', '.htm',
        # Config files
        '.json', '.xml', '.yaml', '.yml',
        # Code files
        '.py', '.js', '.java', '.cpp', '.c', '.go', '.rs',
        # Archive files
        '.zip', '.rar', '.7z', '.tar', '.gz',
    }

    scanned_files: List[str] = []

    if not os.path.exists(upload_dir):
        return scanned_files

    for root, _dirs, files in os.walk(upload_dir):
        for file in files:
            # Skip hidden files and editor/office temp files
            if file.startswith(('.', '~')):
                continue

            file_path = os.path.join(root, file)
            file_extension = os.path.splitext(file)[1].lower()

            if file_extension in supported_extensions:
                scanned_files.append(file_path)
                continue

            # An unsupported, non-empty extension is never processed
            if file_extension:
                continue

            # No extension: accept the file only if its start decodes as UTF-8.
            # In text mode read(1024) reads up to 1024 characters.
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    f.read(1024)
            except (UnicodeDecodeError, OSError):
                # Not text, or unreadable (permission denied, dangling
                # symlink, file removed mid-scan, ...): skip this entry
                # instead of aborting the whole scan.
                continue
            scanned_files.append(file_path)

    return scanned_files


@huey.task()
def process_files_async(
    dataset_id: str,
    files: Optional[Dict[str, List[str]]] = None,
    upload_folder: Optional[Dict[str, str]] = None,
    task_id: Optional[str] = None
) -> Dict[str, Any]:
    """
    Asynchronously process file tasks - compatible with existing files/process API.

    When `files` is empty and `upload_folder` is given, each named folder under
    projects/uploads is scanned and the files found become the input. The
    input is then handed to download_dataset_files, the project README is
    regenerated (best effort), and every document.txt under the project
    directory is collected into the result.

    Args:
        dataset_id: Unique project ID
        files: Dictionary of file paths grouped by key
        upload_folder: Upload folder dictionary organized by group name, e.g. {'group1': 'my_project1', 'group2': 'my_project2'}
        task_id: Task ID (for status tracking)

    Returns:
        Processing result dictionary. On success it has status "success" plus
        the processed/document file lists; on any exception it has status
        "error" with the error message (the exception is not re-raised).
    """
    try:
        print(f"Starting async file processing task, project ID: {dataset_id}")

        # If task_id is provided, set initial status
        if task_id:
            task_status_store.set_status(
                task_id=task_id,
                unique_id=dataset_id,
                status="running"
            )

        # Ensure project directory exists
        project_dir = os.path.join("projects", "data", dataset_id)
        if not os.path.exists(project_dir):
            os.makedirs(project_dir, exist_ok=True)

        # Process files: use key-grouped format
        processed_files_by_key = {}

        # If upload_folder is provided, scan files in those folders
        if upload_folder and not files:
            scanned_files_by_group = {}
            total_scanned_files = 0

            for group_name, folder_name in upload_folder.items():
                # Security check: prevent path traversal attacks.
                # basename() alone is not enough: it returns ".." for ".." and
                # "" for an empty or trailing-slash name, which would make the
                # scan walk "projects" or the whole uploads directory.
                safe_folder_name = os.path.basename(folder_name)
                if safe_folder_name in ("", ".", ".."):
                    print(f"Skipping invalid upload folder name: {folder_name!r} (group: {group_name})")
                    continue

                upload_dir = os.path.join("projects", "uploads", safe_folder_name)

                if os.path.exists(upload_dir):
                    scanned_files = scan_upload_folder(upload_dir)
                    if scanned_files:
                        scanned_files_by_group[group_name] = scanned_files
                        total_scanned_files += len(scanned_files)
                        print(f"Scanned {len(scanned_files)} files from upload folder '{safe_folder_name}' (group: {group_name})")
                    else:
                        print(f"No supported files found in upload folder '{safe_folder_name}' (group: {group_name})")
                else:
                    print(f"Upload folder does not exist: {upload_dir} (group: {group_name})")

            if scanned_files_by_group:
                files = scanned_files_by_group
                print(f"Total scanned {total_scanned_files} files from {len(scanned_files_by_group)} groups")
            else:
                print("No supported files found in any upload folder")

        if files:
            # download_dataset_files is a coroutine function; this task body is
            # synchronous, so drive it to completion on an event loop here.
            import asyncio
            try:
                loop = asyncio.get_event_loop()
            except RuntimeError:
                # No usable loop in this thread: create one and register it
                loop = asyncio.new_event_loop()
                asyncio.set_event_loop(loop)

            processed_files_by_key = loop.run_until_complete(download_dataset_files(dataset_id, files))
            total_files = sum(len(files_list) for files_list in processed_files_by_key.values())
            print(f"Async processed {total_files} dataset files across {len(processed_files_by_key)} keys, project ID: {dataset_id}")
        else:
            print(f"No files provided in request, project ID: {dataset_id}")

        # Collect all document.txt files in the project directory
        document_files = [
            os.path.join(root, file)
            for root, _dirs, files_list in os.walk(project_dir)
            for file in files_list
            if file == "document.txt"
        ]

        # Generate project README.md file
        try:
            from utils.project_manager import save_project_readme
            save_project_readme(dataset_id)
            print(f"README.md generated, project ID: {dataset_id}")
        except Exception as e:
            print(f"Failed to generate README.md, project ID: {dataset_id}, error: {str(e)}")
            # Does not affect main processing flow, continue

        # Build result file list: first the document.txt of every key that
        # was just processed (when it exists on disk) ...
        result_files = []
        for key in processed_files_by_key:
            document_path = os.path.join("projects", "data", dataset_id, "datasets", key, "document.txt")
            if os.path.exists(document_path):
                result_files.append(document_path)

        # ... then every other document.txt found in the project directory
        existing_document_paths = set(result_files)  # Avoid duplicates
        for doc_file in document_files:
            if doc_file not in existing_document_paths:
                result_files.append(doc_file)

        result = {
            "status": "success",
            "message": f"Successfully processed {len(result_files)} document files across {len(processed_files_by_key)} keys",
            "dataset_id": dataset_id,
            "processed_files": result_files,
            "processed_files_by_key": processed_files_by_key,
            "document_files": document_files,
            "total_files_processed": sum(len(files_list) for files_list in processed_files_by_key.values()),
            # NOTE: despite the key name this is the epoch timestamp at
            # completion, not an elapsed duration
            "processing_time": time.time()
        }

        # Update task status to completed
        if task_id:
            task_status_store.update_status(
                task_id=task_id,
                status="completed",
                result=result
            )

        print(f"Async file processing task completed: {dataset_id}")
        return result

    except Exception as e:
        # Task boundary: report the failure through the status store and the
        # returned dict rather than letting the exception escape
        error_msg = f"Error during async file processing: {str(e)}"
        print(error_msg)

        # Update task status to error
        if task_id:
            task_status_store.update_status(
                task_id=task_id,
                status="failed",
                error=error_msg
            )

        return {
            "status": "error",
            "message": error_msg,
            "dataset_id": dataset_id,
            "error": str(e)
        }


@huey.task()
def process_files_incremental_async(
    dataset_id: str,
    files_to_add: Optional[Dict[str, List[str]]] = None,
    files_to_remove: Optional[Dict[str, List[str]]] = None,
    system_prompt: Optional[str] = None,
    mcp_settings: Optional[List[Dict]] = None,
    task_id: Optional[str] = None
) -> Dict[str, Any]:
    """
    Incremental file processing task - supports adding and removing files.

    Order of operations: removals (files on disk and entries in the processing
    log), saving the pruned log, additions via download_dataset_files in
    incremental mode, optional writing of system_prompt.md / mcp_settings.json,
    README regeneration (best effort), and finally collecting document.txt
    files for the result.

    Args:
        dataset_id: Unique project ID
        files_to_add: Dictionary of file paths to add, grouped by key
        files_to_remove: Dictionary of file paths to remove, grouped by key.
            An empty list for a key removes the entire key group.
        system_prompt: System prompt
        mcp_settings: MCP settings
        task_id: Task ID (for status tracking)

    Returns:
        Processing result dictionary. On success it has status "success" plus
        the added/removed/document file lists; on any exception it has status
        "error" with the error message (the exception is not re-raised).
    """
    try:
        print(f"Starting incremental file processing task, project ID: {dataset_id}")

        # If task_id is provided, set initial status
        if task_id:
            task_status_store.set_status(
                task_id=task_id,
                unique_id=dataset_id,
                status="running"
            )

        # Ensure project directory exists
        project_dir = os.path.join("projects", "data", dataset_id)
        if not os.path.exists(project_dir):
            os.makedirs(project_dir, exist_ok=True)

        # Load existing processing log
        processed_log = load_processed_files_log(dataset_id)
        print(f"Loaded existing processing log with {len(processed_log)} file records")

        removed_files = []
        added_files = []

        # 1. Process removals
        if files_to_remove:
            print(f"Starting removal processing across {len(files_to_remove)} key groups")
            for key, file_list in files_to_remove.items():
                if not file_list:  # If file list is empty, remove the entire key group
                    print(f"Removing entire key group: {key}")
                    if remove_dataset_directory_by_key(dataset_id, key):
                        removed_files.append(f"dataset/{key}")

                    # Remove all records for this key from the processing log
                    keys_to_remove = [file_hash for file_hash, file_info in processed_log.items()
                                      if file_info.get('key') == key]
                    for file_hash in keys_to_remove:
                        del processed_log[file_hash]
                        removed_files.append(f"log_entry:{file_hash}")
                else:
                    # Remove specific files
                    for file_path in file_list:
                        print(f"Removing specific file: {key}/{file_path}")

                        filename = os.path.basename(file_path)

                        if filename in ("", ".", ".."):
                            # Without a real file name the paths below would
                            # point at the key directory itself or its parent:
                            # os.remove would fail on a directory and rmtree
                            # could wipe the whole directory. Only the log
                            # entry is handled for such input.
                            print(f"Skipping on-disk removal for path without a file name: {key}/{file_path}")
                        else:
                            # Delete original file
                            source_file = os.path.join(project_dir, "files", key, filename)
                            if os.path.exists(source_file):
                                os.remove(source_file)
                                removed_files.append(f"file:{key}/{filename}")

                            # Delete processed file directory
                            processed_dir = os.path.join(project_dir, "processed", key, filename)
                            if os.path.exists(processed_dir):
                                shutil.rmtree(processed_dir)
                                removed_files.append(f"processed:{key}/{filename}")

                        # Log entries are looked up by the MD5 of the path string
                        file_hash = hashlib.md5(file_path.encode('utf-8')).hexdigest()

                        # Remove from processing log
                        if file_hash in processed_log:
                            del processed_log[file_hash]
                            removed_files.append(f"log_entry:{file_hash}")

        # Save the pruned log before the download step, so the on-disk log
        # matches the deletions even if the download fails, and so this
        # in-memory copy is never written over log changes made later.
        # NOTE(review): whether download_dataset_files itself writes the log is
        # not visible from this file - confirm.
        save_processed_files_log(dataset_id, processed_log)
        print(f"Updated processing log, now contains {len(processed_log)} file records")

        # 2. Process additions
        processed_files_by_key = {}
        if files_to_add:
            print(f"Starting addition processing across {len(files_to_add)} key groups")
            # download_dataset_files is a coroutine function; this task body is
            # synchronous, so drive it to completion on an event loop here.
            import asyncio
            try:
                loop = asyncio.get_event_loop()
            except RuntimeError:
                # No usable loop in this thread: create one and register it
                loop = asyncio.new_event_loop()
                asyncio.set_event_loop(loop)

            processed_files_by_key = loop.run_until_complete(download_dataset_files(dataset_id, files_to_add, incremental_mode=True))
            total_added_files = sum(len(files_list) for files_list in processed_files_by_key.values())
            print(f"Async processed {total_added_files} dataset files across {len(processed_files_by_key)} keys, project ID: {dataset_id}")

            # Record added files
            for key, files_list in processed_files_by_key.items():
                for file_path in files_list:
                    added_files.append(f"{key}/{file_path}")
        else:
            print(f"No files to add provided in request, project ID: {dataset_id}")

        # Save system_prompt and mcp_settings to project directory (if provided)
        if system_prompt:
            system_prompt_file = os.path.join(project_dir, "system_prompt.md")
            with open(system_prompt_file, 'w', encoding='utf-8') as f:
                f.write(system_prompt)
            print(f"Saved system_prompt, project ID: {dataset_id}")

        if mcp_settings:
            mcp_settings_file = os.path.join(project_dir, "mcp_settings.json")
            with open(mcp_settings_file, 'w', encoding='utf-8') as f:
                json.dump(mcp_settings, f, ensure_ascii=False, indent=2)
            print(f"Saved mcp_settings, project ID: {dataset_id}")

        # Generate project README.md file
        try:
            from utils.project_manager import save_project_readme
            save_project_readme(dataset_id)
            print(f"README.md generated, project ID: {dataset_id}")
        except Exception as e:
            print(f"Failed to generate README.md, project ID: {dataset_id}, error: {str(e)}")
            # Does not affect main processing flow, continue

        # Collect all document.txt files in the project directory
        document_files = [
            os.path.join(root, file)
            for root, _dirs, files_list in os.walk(project_dir)
            for file in files_list
            if file == "document.txt"
        ]

        # Build result file list: first the document.txt of every key that
        # was just processed (when it exists on disk) ...
        result_files = []
        for key in processed_files_by_key:
            document_path = os.path.join("projects", "data", dataset_id, "datasets", key, "document.txt")
            if os.path.exists(document_path):
                result_files.append(document_path)

        # ... then every other document.txt found in the project directory
        existing_document_paths = set(result_files)  # Avoid duplicates
        for doc_file in document_files:
            if doc_file not in existing_document_paths:
                result_files.append(doc_file)

        result = {
            "status": "success",
            "message": f"Incremental processing complete - added {len(added_files)} files, removed {len(removed_files)} files, {len(result_files)} document files remaining",
            "dataset_id": dataset_id,
            "removed_files": removed_files,
            "added_files": added_files,
            "processed_files": result_files,
            "processed_files_by_key": processed_files_by_key,
            "document_files": document_files,
            "total_files_added": sum(len(files_list) for files_list in processed_files_by_key.values()),
            # Counts every removal record (files, processed dirs, log entries)
            "total_files_removed": len(removed_files),
            "final_files_count": len(result_files),
            # NOTE: despite the key name this is the epoch timestamp at
            # completion, not an elapsed duration
            "processing_time": time.time()
        }

        # Update task status to completed
        if task_id:
            task_status_store.update_status(
                task_id=task_id,
                status="completed",
                result=result
            )

        print(f"Incremental file processing task completed: {dataset_id}")
        return result

    except Exception as e:
        # Task boundary: report the failure through the status store and the
        # returned dict rather than letting the exception escape
        error_msg = f"Error during incremental file processing: {str(e)}"
        print(error_msg)

        # Update task status to error
        if task_id:
            task_status_store.update_status(
                task_id=task_id,
                status="failed",
                error=error_msg
            )

        return {
            "status": "error",
            "message": error_msg,
            "dataset_id": dataset_id,
            "error": str(e)
        }


@huey.task()
def cleanup_project_async(
    dataset_id: str,
    remove_all: bool = False
) -> Dict[str, Any]:
    """
    Asynchronously clean up project files.

    With remove_all=True and an existing project directory, the whole
    directory is deleted. Otherwise (including remove_all=True when the
    directory does not exist) only the project's processed_files.json log is
    removed, if present.

    Args:
        dataset_id: Unique project ID. Must resolve to a location strictly
            inside projects/data; anything else is reported as an error.
        remove_all: Whether to remove the entire project directory

    Returns:
        Cleanup result dictionary. On success it has status "success", the
        removed paths and the action taken ("remove_all" or "cleanup_logs");
        on any exception it has status "error" with the error message.
    """
    try:
        print(f"Starting async project cleanup, project ID: {dataset_id}")

        project_dir = os.path.join("projects", "data", dataset_id)

        # Safety check before anything is deleted: an empty dataset_id would
        # resolve to projects/data itself and ".." (or an absolute path) to a
        # location outside it, so rmtree could wipe every project.
        data_root = os.path.abspath(os.path.join("projects", "data"))
        resolved_dir = os.path.abspath(project_dir)
        if resolved_dir == data_root or os.path.commonpath([data_root, resolved_dir]) != data_root:
            raise ValueError(f"Invalid dataset_id: {dataset_id!r}")

        removed_items = []

        if remove_all and os.path.exists(project_dir):
            shutil.rmtree(project_dir)
            removed_items.append(project_dir)
            result = {
                "status": "success",
                "message": f"Deleted entire project directory: {project_dir}",
                "dataset_id": dataset_id,
                "removed_items": removed_items,
                "action": "remove_all"
            }
        else:
            # Only clean processing log
            log_file = os.path.join(project_dir, "processed_files.json")
            if os.path.exists(log_file):
                os.remove(log_file)
                removed_items.append(log_file)

            result = {
                "status": "success",
                "message": f"Cleaned project processing log, project ID: {dataset_id}",
                "dataset_id": dataset_id,
                "removed_items": removed_items,
                "action": "cleanup_logs"
            }

        print(f"Async cleanup task completed: {dataset_id}")
        return result

    except Exception as e:
        # Task boundary: report the failure in the returned dict rather than
        # letting the exception escape
        error_msg = f"Error during async project cleanup: {str(e)}"
        print(error_msg)
        return {
            "status": "error",
            "message": error_msg,
            "dataset_id": dataset_id,
            "error": str(e)
        }