catalog-agent/utils/dataset_manager.py
#!/usr/bin/env python3
"""
Dataset management functions for organizing and processing datasets.
"""
import os
import shutil
import json
import tempfile  # needed for tempfile.mkdtemp() during zip extraction below
from typing import Dict, List, Optional
from pathlib import Path
from utils.file_utils import (
    download_file, extract_zip_file, get_file_hash,
    load_processed_files_log, save_processed_files_log,
    remove_file_or_directory
)


async def download_dataset_files(unique_id: str, files: Dict[str, List[str]]) -> Dict[str, List[str]]:
    """Download or copy dataset files and organize them by key into dataset/{key}/document.txt.

    Supports zip file extraction and combines content using '# Page' separators.
    """
    if not files:
        return {}

    # Set up directories
    project_dir = os.path.join("projects", unique_id)
    files_dir = os.path.join(project_dir, "files")
    dataset_dir = os.path.join(project_dir, "dataset")

    # Create directories if they don't exist
    os.makedirs(files_dir, exist_ok=True)
    os.makedirs(dataset_dir, exist_ok=True)

    processed_files_by_key = {}
    def extract_zip_file_func(zip_path: str, extract_dir: str) -> List[str]:
        """Extract zip file and return list of extracted txt/md files"""
        extracted_files = []
        try:
            import zipfile
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                zip_ref.extractall(extract_dir)
            # Find all extracted txt and md files
            for root, dirs, files in os.walk(extract_dir):
                for file in files:
                    if file.lower().endswith(('.txt', '.md')):
                        extracted_files.append(os.path.join(root, file))
            print(f"Extracted {len(extracted_files)} txt/md files from {zip_path}")
            return extracted_files
        except Exception as e:
            print(f"Error extracting zip file {zip_path}: {str(e)}")
            return []
    # Process each key and its associated files
    for key, file_list in files.items():
        print(f"Processing key '{key}' with {len(file_list)} files")
        processed_files_by_key[key] = []

        # Create target directory for this key
        target_dir = os.path.join(dataset_dir, key)
        os.makedirs(target_dir, exist_ok=True)

        # Check if files are already processed before doing any work
        document_file = os.path.join(target_dir, "document.txt")
        pagination_file = os.path.join(target_dir, "pagination.txt")
        embeddings_file = os.path.join(target_dir, "document_embeddings.pkl")
        already_processed = (
            os.path.exists(document_file) and
            os.path.exists(pagination_file) and
            os.path.exists(embeddings_file) and
            os.path.getsize(document_file) > 0 and
            os.path.getsize(pagination_file) > 0 and
            os.path.getsize(embeddings_file) > 0
        )
        if already_processed:
            print(f" Skipping already processed files for {key}")
            processed_files_by_key[key].append(document_file)
            continue  # Skip to next key

        # Read and combine all files for this key
        combined_content = []
        all_processed_files = []

        for file_path in file_list:
            # Check if it's a URL (remote file) or local file
            is_remote = file_path.startswith(('http://', 'https://'))
            filename = file_path.split("/")[-1] if file_path else f"file_{len(all_processed_files)}"

            # Create temporary extraction directory for zip files
            temp_extract_dir = None
            files_to_process = []
            try:
                if is_remote:
                    # Handle remote file
                    temp_file = os.path.join(files_dir, filename)
                    print(f"Downloading {file_path} -> {temp_file}")
                    success = await download_file(file_path, temp_file)
                    if not success:
                        print(f"Failed to download {file_path}")
                        continue

                    # Check if it's a zip file
                    if filename.lower().endswith('.zip'):
                        temp_extract_dir = tempfile.mkdtemp(prefix=f"extract_{key}_")
                        print(f"Extracting zip to temporary directory: {temp_extract_dir}")
                        extracted_files = extract_zip_file_func(temp_file, temp_extract_dir)
                        files_to_process.extend(extracted_files)
                        # The downloaded zip already lives in the project files directory,
                        # so no extra copy is needed (copying a file onto itself would fail).
                        print(f"Stored downloaded zip file: {temp_file}")
                    else:
                        files_to_process.append(temp_file)
                else:
                    # Handle local file
                    if not os.path.exists(file_path):
                        print(f"Local file not found: {file_path}")
                        continue

                    if filename.lower().endswith('.zip'):
                        # Copy to project directory first
                        local_zip_path = os.path.join(files_dir, filename)
                        shutil.copy2(file_path, local_zip_path)
                        print(f"Copied local zip file: {file_path} -> {local_zip_path}")

                        # Extract zip file
                        temp_extract_dir = tempfile.mkdtemp(prefix=f"extract_{key}_")
                        print(f"Extracting local zip to temporary directory: {temp_extract_dir}")
                        extracted_files = extract_zip_file_func(local_zip_path, temp_extract_dir)
                        files_to_process.extend(extracted_files)
                    else:
                        # Copy non-zip file directly
                        dest_file = os.path.join(files_dir, filename)
                        shutil.copy2(file_path, dest_file)
                        files_to_process.append(dest_file)
                        print(f"Copied local file: {file_path} -> {dest_file}")
                # Process all files (extracted from zip or single file)
                for process_file_path in files_to_process:
                    try:
                        with open(process_file_path, 'r', encoding='utf-8') as f:
                            content = f.read().strip()
                        if content:
                            # Add file content with page separator
                            base_filename = os.path.basename(process_file_path)
                            combined_content.append(f"# Page {base_filename}")
                            combined_content.append(content)
                    except Exception as e:
                        print(f"Failed to read file content from {process_file_path}: {str(e)}")
            except Exception as e:
                print(f"Error processing file {file_path}: {str(e)}")
            finally:
                # Clean up temporary extraction directory
                if temp_extract_dir and os.path.exists(temp_extract_dir):
                    try:
                        shutil.rmtree(temp_extract_dir)
                        print(f"Cleaned up temporary directory: {temp_extract_dir}")
                    except Exception as e:
                        print(f"Failed to clean up temporary directory {temp_extract_dir}: {str(e)}")
        # Write combined content to dataset/{key}/document.txt
        if combined_content:
            try:
                with open(document_file, 'w', encoding='utf-8') as f:
                    f.write('\n\n'.join(combined_content))
                print(f"Created combined document: {document_file}")

                # Generate pagination and embeddings for the combined document
                try:
                    import sys
                    sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'embedding'))
                    from embedding import split_document_by_pages, embed_document

                    # Generate pagination
                    print(f" Generating pagination for {key}")
                    pages = split_document_by_pages(str(document_file), str(pagination_file))
                    print(f" Generated {len(pages)} pages")

                    # Generate embeddings
                    print(f" Generating embeddings for {key}")
                    local_model_path = "./models/paraphrase-multilingual-MiniLM-L12-v2"
                    if not os.path.exists(local_model_path):
                        local_model_path = None  # Fallback to HuggingFace model

                    # Use paragraph chunking strategy with default settings
                    embedding_data = embed_document(
                        str(document_file),
                        str(embeddings_file),
                        chunking_strategy='paragraph',
                        model_path=local_model_path
                    )
                    if embedding_data:
                        print(f" Generated embeddings for {len(embedding_data['chunks'])} chunks")
                        # Add to processed files only after successful embedding
                        processed_files_by_key[key].append(document_file)
                    else:
                        print(" Failed to generate embeddings")
                except Exception as e:
                    print(f" Failed to generate pagination/embeddings for {key}: {str(e)}")
            except Exception as e:
                print(f"Failed to write combined document: {str(e)}")
    # Load existing log
    processed_log = load_processed_files_log(unique_id)

    # Update log with newly processed files
    for key, file_list in files.items():
        if key not in processed_log:
            processed_log[key] = {}
        # Use this key's own document file rather than the loop variable
        # left over from the processing loop above.
        key_document_file = os.path.join(dataset_dir, key, "document.txt")
        for file_path in file_list:
            filename = os.path.basename(file_path)
            processed_log[key][filename] = {
                "original_path": file_path,
                "processed_at": str(os.path.getmtime(key_document_file) if os.path.exists(key_document_file) else 0),
                "status": "processed" if key in processed_files_by_key and processed_files_by_key[key] else "failed"
            }

    # Save the updated processed log
    save_processed_files_log(unique_id, processed_log)
    return processed_files_by_key
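
# Illustrative input/output sketch for download_dataset_files (the key and
# paths below are example values, not part of this module). `files` maps each
# dataset key to local paths and/or http(s) URLs, e.g.
#     {"manual-v1": ["https://example.com/manual.zip", "notes/intro.md"]}
# For each key the function produces, under projects/{unique_id}/:
#     files/                                 raw downloads and copies
#     dataset/{key}/document.txt             combined content with '# Page' separators
#     dataset/{key}/pagination.txt           pagination from split_document_by_pages
#     dataset/{key}/document_embeddings.pkl  embeddings from embed_document
# The returned dict maps each key to [path to document.txt] when the key was
# processed (or had already been processed), and to [] otherwise.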


def generate_dataset_structure(unique_id: str) -> str:
    """Generate a string representation of the dataset structure"""
    dataset_dir = os.path.join("projects", unique_id, "dataset")
    structure = []

    def add_directory_contents(dir_path: str, prefix: str = ""):
        try:
            items = sorted(os.listdir(dir_path))
            for i, item in enumerate(items):
                item_path = os.path.join(dir_path, item)
                is_last = i == len(items) - 1
                current_prefix = "└── " if is_last else "├── "
                structure.append(f"{prefix}{current_prefix}{item}")
                if os.path.isdir(item_path):
                    next_prefix = prefix + ("    " if is_last else "│   ")
                    add_directory_contents(item_path, next_prefix)
        except Exception as e:
            structure.append(f"{prefix}└── Error: {str(e)}")

    if os.path.exists(dataset_dir):
        structure.append("dataset/")
        add_directory_contents(dataset_dir, "")
    else:
        structure.append("dataset/ (not found)")

    return "\n".join(structure)
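
# Example of the string this returns for a project with a single dataset key
# ("manual-v1" is a hypothetical key name used only for illustration):
#
#     dataset/
#     └── manual-v1
#         ├── document.txt
#         ├── document_embeddings.pkl
#         └── pagination.txt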


def remove_dataset_directory(unique_id: str, filename_without_ext: str):
    """Remove a specific dataset directory"""
    dataset_path = os.path.join("projects", unique_id, "dataset", filename_without_ext)
    remove_file_or_directory(dataset_path)


def remove_dataset_directory_by_key(unique_id: str, key: str):
    """Remove dataset directory by key"""
    dataset_path = os.path.join("projects", unique_id, "dataset", key)
    remove_file_or_directory(dataset_path)
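

if __name__ == "__main__":
    # Minimal manual smoke test (illustrative only; the project id, key, and
    # sample file below are made-up values). It writes a throwaway text file,
    # runs the async pipeline on it, and prints the resulting dataset layout.
    import asyncio

    demo_id = "demo-project"
    sample_path = os.path.join(tempfile.gettempdir(), "dataset_manager_demo.txt")
    with open(sample_path, "w", encoding="utf-8") as sample:
        sample.write("Hello from the dataset manager smoke test.")

    result = asyncio.run(download_dataset_files(demo_id, {"demo-key": [sample_path]}))
    print("Processed files by key:", result)
    print(generate_dataset_structure(demo_id))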