#!/usr/bin/env python3
import os
import shutil
import logging
from pathlib import Path

# Configure logging
logger = logging.getLogger('app')


def is_file_already_processed(target_file: Path, pagination_file: Path, embeddings_file: Path) -> bool:
    """Check if a file has already been processed (document.txt, pagination.txt, and embeddings exist)."""
    if not target_file.exists():
        return False

    # Check if pagination and embeddings files exist and are not empty
    if pagination_file.exists() and embeddings_file.exists():
        # Check file sizes to ensure they're not empty
        if pagination_file.stat().st_size > 0 and embeddings_file.stat().st_size > 0:
            return True

    return False


def organize_single_project_files(unique_id: str, skip_processed=True):
    """Organize files for a single project from projects/data/{unique_id}/files
    to projects/data/{unique_id}/dataset/{file_name}/document.txt."""
    project_dir = Path("projects") / "data" / unique_id

    if not project_dir.exists():
        logger.error(f"Project directory not found: {project_dir}")
        return

    logger.info(f"Organizing files for project: {unique_id} (skip_processed={skip_processed})")

    files_dir = project_dir / "files"
    dataset_dir = project_dir / "dataset"

    # Check if files directory exists and has files
    if not files_dir.exists():
        logger.info(" No files directory found, skipping...")
        return

    files = list(files_dir.glob("*"))
    if not files:
        logger.info(" Files directory is empty, skipping...")
        return

    # Create dataset directory if it doesn't exist
    dataset_dir.mkdir(exist_ok=True)

    # Copy each file to its own directory
    for file_path in files:
        if file_path.is_file():
            # Get filename without extension as directory name
            file_name_without_ext = file_path.stem
            target_dir = dataset_dir / file_name_without_ext
            target_file = target_dir / "document.txt"
            pagination_file = target_dir / "pagination.txt"
            embeddings_file = target_dir / "embedding.pkl"

            # Check if file is already processed
            if skip_processed and is_file_already_processed(target_file, pagination_file, embeddings_file):
                logger.info(f" Skipping already processed file: {file_path.name}")
                continue

            logger.info(f" Copying {file_path.name} -> {target_file.relative_to(project_dir)}")

            # Create target directory
            target_dir.mkdir(exist_ok=True)

            # Copy and rename file
            shutil.copy2(str(file_path), str(target_file))

    print(" Files remain in original location (copied to dataset structure)")

    # Process each document.txt file: split pages and generate embeddings
    if not skip_processed:
        import sys
        sys.path.append(os.path.join(os.path.dirname(__file__), 'embedding'))
        from embedding import split_document_by_pages, embed_document

        for file_path in files:
            if file_path.is_file():
                file_name_without_ext = file_path.stem
                target_dir = dataset_dir / file_name_without_ext
                document_file = target_dir / "document.txt"
                pagination_file = target_dir / "pagination.txt"
                embeddings_file = target_dir / "embedding.pkl"

                # Skip if already processed
                if is_file_already_processed(document_file, pagination_file, embeddings_file):
                    print(f" Skipping document processing for already processed file: {file_path.name}")
                    continue

                # Split document by pages
                print(f" Splitting pages for {document_file.name}")
                try:
                    pages = split_document_by_pages(str(document_file), str(pagination_file))
                    print(f" Generated {len(pages)} pages")
                except Exception as e:
                    print(f" Failed to split pages: {e}")
                    continue

                # Generate embeddings
                print(f" Generating embeddings for {document_file.name}")
                try:
                    # Use paragraph chunking strategy with default settings
                    embedding_data = embed_document(
                        str(document_file),
                        str(embeddings_file),
                        chunking_strategy='paragraph'
                    )
                    if embedding_data:
                        print(f" Generated embeddings for {len(embedding_data['chunks'])} chunks")
                    else:
                        print(" Failed to generate embeddings")
                except Exception as e:
                    print(f" Failed to generate embeddings: {e}")

        print(f" Document processing completed for project {unique_id}")
    else:
        print(" Skipping document processing (skip_processed=True)")


def organize_dataset_files():
    """Copy files from projects/data/{unique_id}/files to projects/data/{unique_id}/dataset/{file_name}/document.txt."""
    projects_dir = Path("projects") / "data"

    if not projects_dir.exists():
        print("Projects directory not found")
        return

    # Get all project directories (exclude cache and other non-project dirs)
    project_dirs = [d for d in projects_dir.iterdir()
                    if d.is_dir() and d.name != "_cache" and not d.name.startswith(".")]

    for project_dir in project_dirs:
        print(f"\nProcessing project: {project_dir.name}")

        files_dir = project_dir / "files"
        dataset_dir = project_dir / "dataset"

        # Check if files directory exists and has files
        if not files_dir.exists():
            logger.info(" No files directory found, skipping...")
            continue

        files = list(files_dir.glob("*"))
        if not files:
            logger.info(" Files directory is empty, skipping...")
            continue

        # Create dataset directory if it doesn't exist
        dataset_dir.mkdir(exist_ok=True)

        # Copy each file to its own directory
        for file_path in files:
            if file_path.is_file():
                # Get filename without extension as directory name
                file_name_without_ext = file_path.stem
                target_dir = dataset_dir / file_name_without_ext
                target_file = target_dir / "document.txt"

                logger.info(f" Copying {file_path.name} -> {target_file.relative_to(project_dir)}")

                # Create target directory
                target_dir.mkdir(exist_ok=True)

                # Copy and rename file
                shutil.copy2(str(file_path), str(target_file))

        print(" Files remain in original location (copied to dataset structure)")

    print("\nFile organization complete!")


if __name__ == "__main__":
    organize_dataset_files()
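
# Usage sketch: besides running the script directly (which organizes every
# project), a single project can be processed from another module. The
# unique_id "demo-project" below is hypothetical; it must match a directory
# under projects/data/ that contains a files/ subdirectory. Passing
# skip_processed=False also runs the page-splitting and embedding steps via
# the local `embedding` package.
#
#   organize_single_project_files("demo-project", skip_processed=False)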