#!/usr/bin/env python3
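"""Organize uploaded project files into a per-document dataset layout.

Each file under projects/{unique_id}/files/ is copied to
projects/{unique_id}/dataset/{file_stem}/document.txt; each document can
then be split into pages (pagination.txt) and embedded (embedding.pkl).
"""
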
import os
import shutil
from pathlib import Path


def is_file_already_processed(target_file: Path, pagination_file: Path, embeddings_file: Path) -> bool:
    """Check whether a file has already been processed: document.txt,
    pagination.txt, and embedding.pkl all exist and are non-empty."""
    if not target_file.exists():
        return False
    # Pagination and embeddings files must exist...
    if pagination_file.exists() and embeddings_file.exists():
        # ...and be non-empty
        if pagination_file.stat().st_size > 0 and embeddings_file.stat().st_size > 0:
            return True
    return False
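

# Expected per-document layout under projects/{unique_id}/dataset/{file_stem}/:
#   document.txt   - the copied source file
#   pagination.txt - page boundaries written by split_document_by_pages
#   embedding.pkl  - pickled embeddings written by embed_document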
def organize_single_project_files(unique_id: str, skip_processed=True):
    """Organize files for a single project from projects/{unique_id}/files
    to projects/{unique_id}/dataset/{file_name}/document.txt."""
    project_dir = Path("projects") / unique_id
    if not project_dir.exists():
        print(f"Project directory not found: {project_dir}")
        return
    print(f"Organizing files for project: {unique_id} (skip_processed={skip_processed})")
    files_dir = project_dir / "files"
    dataset_dir = project_dir / "dataset"
    # Check if the files directory exists and has files
    if not files_dir.exists():
        print(" No files directory found, skipping...")
        return
    files = list(files_dir.glob("*"))
    if not files:
        print(" Files directory is empty, skipping...")
        return
    # Create the dataset directory if it doesn't exist
    dataset_dir.mkdir(exist_ok=True)
    # Copy each file into its own directory
    for file_path in files:
        if file_path.is_file():
            # Use the filename without extension as the directory name
            file_name_without_ext = file_path.stem
            target_dir = dataset_dir / file_name_without_ext
            target_file = target_dir / "document.txt"
            pagination_file = target_dir / "pagination.txt"
            embeddings_file = target_dir / "embedding.pkl"
            # Skip files that are already fully processed
            if skip_processed and is_file_already_processed(target_file, pagination_file, embeddings_file):
                print(f" Skipping already processed file: {file_path.name}")
                continue
            print(f" Copying {file_path.name} -> {target_file.relative_to(project_dir)}")
            # Create the target directory
            target_dir.mkdir(exist_ok=True)
            # Copy and rename the file
            shutil.copy2(str(file_path), str(target_file))
    print(" Files remain in original location (copied to dataset structure)")
    # Process each document.txt file: split pages and generate embeddings
    if not skip_processed:
        import sys
        sys.path.append(os.path.join(os.path.dirname(__file__), 'embedding'))
        from embedding import split_document_by_pages, embed_document
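        # Assumed helper signatures, inferred from the calls below:
        #   split_document_by_pages(document_path, pagination_path) -> list of pages
        #   embed_document(document_path, embeddings_path, chunking_strategy=...) -> dict with a 'chunks' key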
        for file_path in files:
            if file_path.is_file():
                file_name_without_ext = file_path.stem
                target_dir = dataset_dir / file_name_without_ext
                document_file = target_dir / "document.txt"
                pagination_file = target_dir / "pagination.txt"
                embeddings_file = target_dir / "embedding.pkl"
                # Skip if already processed
                if is_file_already_processed(document_file, pagination_file, embeddings_file):
                    print(f" Skipping document processing for already processed file: {file_path.name}")
                    continue
                # Split the document by pages
                print(f" Splitting pages for {document_file.name}")
                try:
                    pages = split_document_by_pages(str(document_file), str(pagination_file))
                    print(f" Generated {len(pages)} pages")
                except Exception as e:
                    print(f" Failed to split pages: {e}")
                    continue
                # Generate embeddings
                print(f" Generating embeddings for {document_file.name}")
                try:
                    # Use the paragraph chunking strategy with default settings
                    embedding_data = embed_document(
                        str(document_file),
                        str(embeddings_file),
                        chunking_strategy='paragraph'
                    )
                    if embedding_data:
                        print(f" Generated embeddings for {len(embedding_data['chunks'])} chunks")
                    else:
                        print(" Failed to generate embeddings")
                except Exception as e:
                    print(f" Failed to generate embeddings: {e}")
        print(f" Document processing completed for project {unique_id}")
    else:
        print(" Skipping document processing (skip_processed=True)")


def organize_dataset_files():
    """Copy files from projects/{unique_id}/files to
    projects/{unique_id}/dataset/{file_name}/document.txt for every project."""
    projects_dir = Path("projects")
    if not projects_dir.exists():
        print("Projects directory not found")
        return
    # Get all project directories (exclude cache and other non-project dirs)
    project_dirs = [d for d in projects_dir.iterdir()
                    if d.is_dir() and d.name != "_cache" and not d.name.startswith(".")]
    for project_dir in project_dirs:
        print(f"\nProcessing project: {project_dir.name}")
        files_dir = project_dir / "files"
        dataset_dir = project_dir / "dataset"
        # Check if the files directory exists and has files
        if not files_dir.exists():
            print(" No files directory found, skipping...")
            continue
        files = list(files_dir.glob("*"))
        if not files:
            print(" Files directory is empty, skipping...")
            continue
        # Create the dataset directory if it doesn't exist
        dataset_dir.mkdir(exist_ok=True)
        # Copy each file into its own directory
        for file_path in files:
            if file_path.is_file():
                # Use the filename without extension as the directory name
                file_name_without_ext = file_path.stem
                target_dir = dataset_dir / file_name_without_ext
                target_file = target_dir / "document.txt"
                print(f" Copying {file_path.name} -> {target_file.relative_to(project_dir)}")
                # Create the target directory
                target_dir.mkdir(exist_ok=True)
                # Copy and rename the file
                shutil.copy2(str(file_path), str(target_file))
        print(" Files remain in original location (copied to dataset structure)")
    print("\nFile organization complete!")


if __name__ == "__main__":
    organize_dataset_files()
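    # To (re)process a single project end-to-end (copy + page split + embeddings),
    # one could instead call, with a hypothetical project id:
    # organize_single_project_files("demo_project", skip_processed=False)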