#!/usr/bin/env python3
import os
import shutil
import logging
from pathlib import Path

# Configure logging
logger = logging.getLogger('app')


def is_file_already_processed(target_file: Path, pagination_file: Path, embeddings_file: Path) -> bool:
    """Check if a file has already been processed (document.txt, pagination.txt, and embeddings exist)."""
    if not target_file.exists():
        return False

    # Check if pagination and embeddings files exist and are not empty
    if pagination_file.exists() and embeddings_file.exists():
        # Check file sizes to ensure they're not empty
        if pagination_file.stat().st_size > 0 and embeddings_file.stat().st_size > 0:
            return True

    return False


def organize_single_project_files(unique_id: str, skip_processed=True):
    """Organize files for a single project from projects/data/{unique_id}/files
    to projects/data/{unique_id}/dataset/{file_name}/document.txt."""
    project_dir = Path("projects") / "data" / unique_id

    if not project_dir.exists():
        logger.error(f"Project directory not found: {project_dir}")
        return

    logger.info(f"Organizing files for project: {unique_id} (skip_processed={skip_processed})")

    files_dir = project_dir / "files"
    dataset_dir = project_dir / "dataset"

    # Check if files directory exists and has files
    if not files_dir.exists():
        logger.info(" No files directory found, skipping...")
        return

    files = list(files_dir.glob("*"))
    if not files:
        logger.info(" Files directory is empty, skipping...")
        return

    # Create dataset directory if it doesn't exist
    dataset_dir.mkdir(exist_ok=True)

    # Copy each file to its own directory
    for file_path in files:
        if file_path.is_file():
            # Get filename without extension as directory name
            file_name_without_ext = file_path.stem
            target_dir = dataset_dir / file_name_without_ext
            target_file = target_dir / "document.txt"
            pagination_file = target_dir / "pagination.txt"
            embeddings_file = target_dir / "embedding.pkl"

            # Check if file is already processed
            if skip_processed and is_file_already_processed(target_file, pagination_file, embeddings_file):
                logger.info(f" Skipping already processed file: {file_path.name}")
                continue

            logger.info(f" Copying {file_path.name} -> {target_file.relative_to(project_dir)}")

            # Create target directory
            target_dir.mkdir(exist_ok=True)

            # Copy and rename file
            shutil.copy2(str(file_path), str(target_file))

    print(" Files remain in original location (copied to dataset structure)")

    # Process each document.txt file: split pages and generate embeddings
    if not skip_processed:
        import sys
        sys.path.append(os.path.join(os.path.dirname(__file__), 'embedding'))
        from embedding import split_document_by_pages, embed_document

        for file_path in files:
            if file_path.is_file():
                file_name_without_ext = file_path.stem
                target_dir = dataset_dir / file_name_without_ext
                document_file = target_dir / "document.txt"
                pagination_file = target_dir / "pagination.txt"
                embeddings_file = target_dir / "embedding.pkl"

                # Skip if already processed
                if is_file_already_processed(document_file, pagination_file, embeddings_file):
                    print(f" Skipping document processing for already processed file: {file_path.name}")
                    continue

                # Split document by pages
                print(f" Splitting pages for {document_file.name}")
                try:
                    pages = split_document_by_pages(str(document_file), str(pagination_file))
                    print(f" Generated {len(pages)} pages")
                except Exception as e:
                    print(f" Failed to split pages: {e}")
                    continue

                # Generate embeddings
                print(f" Generating embeddings for {document_file.name}")
                try:
                    # Use paragraph chunking strategy with default settings
                    embedding_data = embed_document(
                        str(document_file),
                        str(embeddings_file),
                        chunking_strategy='paragraph'
                    )
                    if embedding_data:
                        print(f" Generated embeddings for {len(embedding_data['chunks'])} chunks")
                    else:
                        print(" Failed to generate embeddings")
                except Exception as e:
                    print(f" Failed to generate embeddings: {e}")

        print(f" Document processing completed for project {unique_id}")
    else:
        print(" Skipping document processing (skip_processed=True)")


def organize_dataset_files():
    """Copy files from projects/data/{unique_id}/files to projects/data/{unique_id}/dataset/{file_name}/document.txt."""
    projects_dir = Path("projects") / "data"

    if not projects_dir.exists():
        print("Projects directory not found")
        return

    # Get all project directories (exclude cache and other non-project dirs)
    project_dirs = [d for d in projects_dir.iterdir()
                    if d.is_dir() and d.name != "_cache" and not d.name.startswith(".")]

    for project_dir in project_dirs:
        print(f"\nProcessing project: {project_dir.name}")

        files_dir = project_dir / "files"
        dataset_dir = project_dir / "dataset"

        # Check if files directory exists and has files
        if not files_dir.exists():
            logger.info(" No files directory found, skipping...")
            continue

        files = list(files_dir.glob("*"))
        if not files:
            logger.info(" Files directory is empty, skipping...")
            continue

        # Create dataset directory if it doesn't exist
        dataset_dir.mkdir(exist_ok=True)

        # Copy each file to its own directory
        for file_path in files:
            if file_path.is_file():
                # Get filename without extension as directory name
                file_name_without_ext = file_path.stem
                target_dir = dataset_dir / file_name_without_ext
                target_file = target_dir / "document.txt"

                logger.info(f" Copying {file_path.name} -> {target_file.relative_to(project_dir)}")

                # Create target directory
                target_dir.mkdir(exist_ok=True)

                # Copy and rename file
                shutil.copy2(str(file_path), str(target_file))

        print(" Files remain in original location (copied to dataset structure)")

    print("\nFile organization complete!")


if __name__ == "__main__":
    organize_dataset_files()
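
# Usage sketch: besides running the script directly (which organizes every
# project), a single project can be processed from another module. The
# unique_id "demo-project" below is hypothetical; it must match a directory
# under projects/data/ that contains a files/ subdirectory. Passing
# skip_processed=False also runs the page-splitting and embedding steps via
# the local `embedding` package.
#
#   organize_single_project_files("demo-project", skip_processed=False)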