#!/usr/bin/env python3
"""
Single file processing functions for handling individual files.
"""

import os
import sys
import tempfile
import zipfile
from typing import Dict, List, Tuple, Optional
from pathlib import Path

from utils.file_utils import download_file

# Try to import excel/csv processor, but handle if dependencies are missing
try:
    from utils.excel_csv_processor import (
        is_excel_file, is_csv_file, process_excel_file, process_csv_file
    )
    EXCEL_CSV_SUPPORT = True
except ImportError as e:
    print(f"Excel/CSV processing not available: {e}")
    EXCEL_CSV_SUPPORT = False

    # Fallback functions: detection still works by extension, but the
    # processors yield no content so such files are reported as empty.
    def is_excel_file(file_path):
        return file_path.lower().endswith(('.xlsx', '.xls'))

    def is_csv_file(file_path):
        return file_path.lower().endswith('.csv')

    def process_excel_file(file_path):
        return "", []

    def process_csv_file(file_path):
        return "", []


def _ensure_embedding_importable() -> None:
    """Put the sibling ``embedding`` directory on ``sys.path`` exactly once."""
    embedding_dir = os.path.join(os.path.dirname(__file__), '..', 'embedding')
    # Guard against appending a duplicate entry on every call.
    if embedding_dir not in sys.path:
        sys.path.append(embedding_dir)


async def process_single_file(
    unique_id: str,
    group_name: str,
    filename: str,
    original_path: str,
    local_path: str
) -> Dict:
    """
    Process a single file and generate document.txt, pagination.txt, and embedding.pkl.

    Args:
        unique_id: Identifier used as a path component under ``projects/data``.
        group_name: Group directory the processed output is stored under.
        filename: Name of the file; its stem names the output directory and
            its extension drives zip detection.
        original_path: Source location. If it starts with ``http://`` or
            ``https://`` and ``local_path`` does not exist yet, the file is
            downloaded first.
        local_path: Local path the content is read from.

    Returns:
        Dict with processing results and file paths. ``success`` is True only
        when the document, pagination and embeddings steps all completed;
        otherwise ``error`` holds a description of the failing step.
    """
    # Create output directory for this file
    filename_stem = Path(filename).stem
    output_dir = os.path.join("projects", "data", unique_id, "processed", group_name, filename_stem)
    os.makedirs(output_dir, exist_ok=True)

    result = {
        "success": False,
        "filename": filename,
        "group": group_name,
        "output_dir": output_dir,
        "document_path": os.path.join(output_dir, "document.txt"),
        "pagination_path": os.path.join(output_dir, "pagination.txt"),
        "embedding_path": os.path.join(output_dir, "embedding.pkl"),
        "error": None,
        "content_size": 0,
        "pagination_lines": 0,
        "embedding_chunks": 0
    }

    try:
        # Download file if it's remote and not yet downloaded
        is_remote = original_path.startswith(('http://', 'https://'))
        if is_remote and not os.path.exists(local_path):
            print(f"Downloading {original_path} -> {local_path}")
            if not await download_file(original_path, local_path):
                result["error"] = "Failed to download file"
                return result

        # Extract content from file
        content, pagination_lines = await extract_file_content(local_path, filename)
        if not content or not content.strip():
            result["error"] = "No content extracted from file"
            return result

        # Write document.txt
        with open(result["document_path"], 'w', encoding='utf-8') as f:
            f.write(content)
        result["content_size"] = len(content)

        # Write pagination.txt. Blank lines are never written, so they are
        # dropped up front: the reported count then matches the file, and a
        # list of only-blank lines falls back to text-based pagination
        # instead of producing an empty pagination.txt.
        written_lines = [line for line in (pagination_lines or []) if line.strip()]
        if written_lines:
            with open(result["pagination_path"], 'w', encoding='utf-8') as f:
                f.writelines(f"{line}\n" for line in written_lines)
        else:
            # Generate pagination from text content
            written_lines = generate_pagination_from_text(
                result["document_path"], result["pagination_path"]
            )
        result["pagination_lines"] = len(written_lines)

        # Generate embeddings
        try:
            embedding_chunks = await generate_embeddings_for_file(
                result["document_path"], result["embedding_path"]
            )
            if embedding_chunks is None:
                # generate_embeddings_for_file reports every failure as None
                # (it catches its own exceptions), so None must be treated as
                # an error here or failures would be reported as success.
                result["error"] = "Embedding generation failed: no embedding data produced"
                print(f"Failed to generate embeddings for {filename}: no embedding data produced")
            else:
                result["embedding_chunks"] = len(embedding_chunks)
                result["success"] = True
        except Exception as e:
            result["error"] = f"Embedding generation failed: {str(e)}"
            print(f"Failed to generate embeddings for {filename}: {str(e)}")

    except Exception as e:
        result["error"] = f"File processing failed: {str(e)}"
        print(f"Error processing file {filename}: {str(e)}")

    return result


async def extract_file_content(file_path: str, filename: str) -> Tuple[str, List[str]]:
    """
    Extract content from various file formats.

    Zip detection uses ``filename``; Excel/CSV detection uses ``file_path``.
    Anything else is read as UTF-8 text.

    Returns:
        Tuple of (document text, pagination lines). Both are empty when
        nothing could be extracted.
    """
    # Handle zip files
    if filename.lower().endswith('.zip'):
        return await extract_from_zip(file_path, filename)
    # Handle Excel files
    if is_excel_file(file_path):
        return await extract_from_excel(file_path, filename)
    # Handle CSV files
    if is_csv_file(file_path):
        return await extract_from_csv(file_path, filename)
    # Handle text files
    return await extract_from_text(file_path, filename)


async def extract_from_zip(zip_path: str, filename: str) -> Tuple[str, List[str]]:
    """
    Extract content from zip file.

    The archive is unpacked into a temporary directory and every contained
    ``.txt``/``.md``/``.xlsx``/``.xls``/``.csv`` file is processed; other
    members (including nested zips) are ignored. A failure on one member is
    logged and skipped; a failure opening or unpacking the archive yields
    ``("", [])``.

    Returns:
        Tuple of (member contents joined by blank lines, each preceded by a
        ``# Page <member name>`` header; concatenated pagination lines).
    """
    supported_suffixes = ('.txt', '.md', '.xlsx', '.xls', '.csv')
    content_parts = []
    pagination_lines = []

    try:
        # TemporaryDirectory guarantees cleanup even if extraction or the
        # walk below raises; a manual rmtree after the loop would be skipped.
        with zipfile.ZipFile(zip_path, 'r') as zip_ref, \
                tempfile.TemporaryDirectory(prefix=f"extract_{Path(filename).stem}_") as temp_dir:
            zip_ref.extractall(temp_dir)

            # Process extracted files
            for root, dirs, files in os.walk(temp_dir):
                # os.walk order is filesystem-dependent; sort so the
                # resulting document is deterministic.
                dirs.sort()
                for file in sorted(files):
                    if not file.lower().endswith(supported_suffixes):
                        continue
                    file_path = os.path.join(root, file)
                    try:
                        file_content, file_pagination = await extract_file_content(file_path, file)
                        if file_content:
                            # NOTE(review): Excel/CSV content already starts
                            # with its own "# Page <name>" header, so those
                            # members get the header twice — confirm intended.
                            content_parts.append(f"# Page {file}")
                            content_parts.append(file_content)
                            pagination_lines.extend(file_pagination)
                    except Exception as e:
                        print(f"Error processing extracted file {file}: {str(e)}")

    except Exception as e:
        print(f"Error extracting zip file {filename}: {str(e)}")
        return "", []

    return '\n\n'.join(content_parts), pagination_lines


async def extract_from_excel(file_path: str, filename: str) -> Tuple[str, List[str]]:
    """
    Extract content from Excel file.

    Returns:
        Tuple of (content prefixed with ``# Page <filename>``, pagination
        lines), or ``("", [])`` when the processor yields nothing or raises.
    """
    try:
        document_content, pagination_lines = process_excel_file(file_path)
    except Exception as e:
        print(f"Error processing Excel file {filename}: {str(e)}")
        return "", []

    if not document_content:
        return "", []
    return f"# Page {filename}\n{document_content}", pagination_lines


async def extract_from_csv(file_path: str, filename: str) -> Tuple[str, List[str]]:
    """
    Extract content from CSV file.

    Returns:
        Tuple of (content prefixed with ``# Page <filename>``, pagination
        lines), or ``("", [])`` when the processor yields nothing or raises.
    """
    try:
        document_content, pagination_lines = process_csv_file(file_path)
    except Exception as e:
        print(f"Error processing CSV file {filename}: {str(e)}")
        return "", []

    if not document_content:
        return "", []
    return f"# Page {filename}\n{document_content}", pagination_lines


async def extract_from_text(file_path: str, filename: str) -> Tuple[str, List[str]]:
    """
    Extract content from text file.

    The file is read as UTF-8 and stripped of surrounding whitespace.

    Returns:
        Tuple of (text, []) — text files carry no pagination of their own.
        ``("", [])`` is returned if the file is empty or cannot be read.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read().strip()
    except Exception as e:
        print(f"Error reading text file {filename}: {str(e)}")
        return "", []

    return content, []


def generate_pagination_from_text(document_path: str, pagination_path: str) -> List[str]:
    """
    Generate pagination from text document.

    Delegates to ``embedding.split_document_by_pages`` to write
    ``pagination_path``, then reads that file back.

    Returns:
        The stripped, non-blank lines of the generated pagination file, or an
        empty list if generation or reading fails.
    """
    try:
        # Import embedding module for pagination
        _ensure_embedding_importable()
        from embedding import split_document_by_pages

        split_document_by_pages(str(document_path), str(pagination_path))

        # Return pagination lines
        with open(pagination_path, 'r', encoding='utf-8') as f:
            return [line.strip() for line in f if line.strip()]

    except Exception as e:
        print(f"Error generating pagination from text: {str(e)}")
        return []


async def generate_embeddings_for_file(document_path: str, embedding_path: str) -> Optional[List]:
    """
    Generate embeddings for a document.

    Calls ``embedding.embed_document`` with paragraph chunking.

    Returns:
        The ``chunks`` entry of the embedding data, or None when no chunk
        data was produced or any error occurred (errors are logged, never
        raised — callers must check for None).
    """
    try:
        # Import embedding module
        _ensure_embedding_importable()
        from embedding import embed_document

        # Generate embeddings using paragraph chunking
        embedding_data = embed_document(
            str(document_path),
            str(embedding_path),
            chunking_strategy='paragraph'
        )

        if embedding_data and 'chunks' in embedding_data:
            return embedding_data['chunks']
        return None

    except Exception as e:
        print(f"Error generating embeddings: {str(e)}")
        return None


def check_file_already_processed(unique_id: str, group_name: str, filename: str) -> bool:
    """
    Check if a file has already been processed.

    Returns:
        True only if document.txt, pagination.txt and embedding.pkl all exist
        in the file's output directory and each is non-empty.
    """
    filename_stem = Path(filename).stem
    output_dir = os.path.join("projects", "data", unique_id, "processed", group_name, filename_stem)

    required_paths = [
        os.path.join(output_dir, "document.txt"),
        os.path.join(output_dir, "pagination.txt"),
        os.path.join(output_dir, "embedding.pkl"),
    ]

    # Check if all files exist and are not empty
    return all(
        os.path.exists(path) and os.path.getsize(path) > 0
        for path in required_paths
    )