#!/usr/bin/env python3
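"""Organize uploaded project files into a per-document dataset layout.

Copies each file in projects/{unique_id}/files/ to
projects/{unique_id}/dataset/{file_stem}/document.txt and, optionally,
splits each document into pages (pagination.txt) and generates embeddings
(embedding.pkl) via the local `embedding` module.
"""
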
import os
import shutil
from pathlib import Path


def is_file_already_processed(target_file: Path, pagination_file: Path, embeddings_file: Path) -> bool:
    """Check if a file has already been processed (document.txt, pagination.txt, and embeddings exist)."""
    if not target_file.exists():
        return False

    # Pagination and embeddings files must exist and be non-empty
    if pagination_file.exists() and embeddings_file.exists():
        if pagination_file.stat().st_size > 0 and embeddings_file.stat().st_size > 0:
            return True

    return False


def organize_single_project_files(unique_id: str, skip_processed=True):
    """Organize files for a single project from projects/{unique_id}/files to projects/{unique_id}/dataset/{file_name}/document.txt"""
    project_dir = Path("projects") / unique_id

    if not project_dir.exists():
        print(f"Project directory not found: {project_dir}")
        return

    print(f"Organizing files for project: {unique_id} (skip_processed={skip_processed})")

    files_dir = project_dir / "files"
    dataset_dir = project_dir / "dataset"

    # Check if the files directory exists and has files
    if not files_dir.exists():
        print(" No files directory found, skipping...")
        return

    files = list(files_dir.glob("*"))
    if not files:
        print(" Files directory is empty, skipping...")
        return

    # Create the dataset directory if it doesn't exist
    dataset_dir.mkdir(exist_ok=True)

    # Copy each file to its own directory
    for file_path in files:
        if file_path.is_file():
            # Use the filename without extension as the directory name
            file_name_without_ext = file_path.stem
            target_dir = dataset_dir / file_name_without_ext
            target_file = target_dir / "document.txt"
            pagination_file = target_dir / "pagination.txt"
            embeddings_file = target_dir / "embedding.pkl"

            # Skip files that have already been processed
            if skip_processed and is_file_already_processed(target_file, pagination_file, embeddings_file):
                print(f" Skipping already processed file: {file_path.name}")
                continue

            print(f" Copying {file_path.name} -> {target_file.relative_to(project_dir)}")

            # Create the target directory, then copy and rename the file
            target_dir.mkdir(exist_ok=True)
            shutil.copy2(str(file_path), str(target_file))

    print(" Files remain in original location (copied to dataset structure)")

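    # NOTE: the processing phase below depends on a sibling `embedding`
    # module; the signatures of split_document_by_pages() and
    # embed_document() are assumed from how they are called here
    # (input path, output path, and a chunking_strategy keyword).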
    # Process each document.txt file: split pages and generate embeddings
    if not skip_processed:
        import sys
        sys.path.append(os.path.join(os.path.dirname(__file__), 'embedding'))

        from embedding import split_document_by_pages, embed_document

        for file_path in files:
            if file_path.is_file():
                file_name_without_ext = file_path.stem
                target_dir = dataset_dir / file_name_without_ext
                document_file = target_dir / "document.txt"
                pagination_file = target_dir / "pagination.txt"
                embeddings_file = target_dir / "embedding.pkl"

                # Skip if already processed
                if is_file_already_processed(document_file, pagination_file, embeddings_file):
                    print(f" Skipping document processing for already processed file: {file_path.name}")
                    continue

                # Split the document by pages
                print(f" Splitting pages for {document_file.name}")
                try:
                    pages = split_document_by_pages(str(document_file), str(pagination_file))
                    print(f" Generated {len(pages)} pages")
                except Exception as e:
                    print(f" Failed to split pages: {e}")
                    continue

                # Generate embeddings
                print(f" Generating embeddings for {document_file.name}")
                try:
                    # Use the paragraph chunking strategy with default settings
                    embedding_data = embed_document(
                        str(document_file),
                        str(embeddings_file),
                        chunking_strategy='paragraph'
                    )

                    if embedding_data:
                        print(f" Generated embeddings for {len(embedding_data['chunks'])} chunks")
                    else:
                        print(" Failed to generate embeddings")
                except Exception as e:
                    print(f" Failed to generate embeddings: {e}")

        print(f" Document processing completed for project {unique_id}")
    else:
        print(" Skipping document processing (skip_processed=True)")


def organize_dataset_files():
    """Copy files from projects/{unique_id}/files to projects/{unique_id}/dataset/{file_name}/document.txt"""
    projects_dir = Path("projects")

    if not projects_dir.exists():
        print("Projects directory not found")
        return

    # Get all project directories (exclude cache and other non-project dirs)
    project_dirs = [d for d in projects_dir.iterdir()
                    if d.is_dir() and d.name != "_cache" and not d.name.startswith(".")]

    for project_dir in project_dirs:
        print(f"\nProcessing project: {project_dir.name}")

        files_dir = project_dir / "files"
        dataset_dir = project_dir / "dataset"

        # Check if the files directory exists and has files
        if not files_dir.exists():
            print(" No files directory found, skipping...")
            continue

        files = list(files_dir.glob("*"))
        if not files:
            print(" Files directory is empty, skipping...")
            continue

        # Create the dataset directory if it doesn't exist
        dataset_dir.mkdir(exist_ok=True)

        # Copy each file to its own directory
        for file_path in files:
            if file_path.is_file():
                # Use the filename without extension as the directory name
                file_name_without_ext = file_path.stem
                target_dir = dataset_dir / file_name_without_ext
                target_file = target_dir / "document.txt"

                print(f" Copying {file_path.name} -> {target_file.relative_to(project_dir)}")

                # Create the target directory, then copy and rename the file
                target_dir.mkdir(exist_ok=True)
                shutil.copy2(str(file_path), str(target_file))

        print(" Files remain in original location (copied to dataset structure)")

    print("\nFile organization complete!")


if __name__ == "__main__":
    organize_dataset_files()
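    # Example (placeholder project id): re-run a single project end-to-end,
    # including page splitting and embedding generation:
    # organize_single_project_files("example-project-id", skip_processed=False)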