#!/usr/bin/env python3
"""
Single-file processing functions for handling individual files.
"""

import os
import shutil
import tempfile
import zipfile
import logging
from typing import Dict, List, Tuple, Optional
from pathlib import Path

# Configure logger
logger = logging.getLogger('app')

from utils.file_utils import download_file

# Try to import the Excel/CSV processor, but fall back gracefully
# if its dependencies are missing.
try:
    from utils.excel_csv_processor import (
        is_excel_file, is_csv_file, process_excel_file, process_csv_file
    )
    EXCEL_CSV_SUPPORT = True
except ImportError as e:
    logger.warning(f"Excel/CSV processing not available: {e}")
    EXCEL_CSV_SUPPORT = False

    # Fallback functions: detect file types by extension and return empty
    # results so callers can proceed without the optional dependencies.
    # Defined inside the except block so they do not shadow the real
    # implementations when the import succeeds.
    def is_excel_file(file_path):
        return file_path.lower().endswith(('.xlsx', '.xls'))

    def is_csv_file(file_path):
        return file_path.lower().endswith('.csv')

    def process_excel_file(file_path):
        return "", []

    def process_csv_file(file_path):
        return "", []

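
# Each processed file gets its own output directory (paths as used below):
#   projects/data/<unique_id>/processed/<group_name>/<filename stem>/
#       document.txt     extracted text content
#       pagination.txt   page boundary lines
#       embedding.pkl    pickled embedding chunks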
async def process_single_file(
    unique_id: str,
    group_name: str,
    filename: str,
    original_path: str,
    local_path: str
) -> Dict:
    """
    Process a single file and generate document.txt, pagination.txt, and embedding.pkl.

    Returns:
        Dict with processing results and file paths
    """
    # Create the output directory for this file
    filename_stem = Path(filename).stem
    output_dir = os.path.join("projects", "data", unique_id, "processed", group_name, filename_stem)
    os.makedirs(output_dir, exist_ok=True)

    result = {
        "success": False,
        "filename": filename,
        "group": group_name,
        "output_dir": output_dir,
        "document_path": os.path.join(output_dir, "document.txt"),
        "pagination_path": os.path.join(output_dir, "pagination.txt"),
        "embedding_path": os.path.join(output_dir, "embedding.pkl"),
        "error": None,
        "content_size": 0,
        "pagination_lines": 0,
        "embedding_chunks": 0
    }

    try:
        # Download the file if it's remote and not yet downloaded
        if original_path.startswith(('http://', 'https://')):
            if not os.path.exists(local_path):
                logger.info(f"Downloading {original_path} -> {local_path}")
                success = await download_file(original_path, local_path)
                if not success:
                    result["error"] = "Failed to download file"
                    return result

        # Extract content from the file
        content, pagination_lines = await extract_file_content(local_path, filename)

        if not content or not content.strip():
            result["error"] = "No content extracted from file"
            return result

        # Write document.txt
        with open(result["document_path"], 'w', encoding='utf-8') as f:
            f.write(content)
        result["content_size"] = len(content)

        # Write pagination.txt
        if pagination_lines:
            with open(result["pagination_path"], 'w', encoding='utf-8') as f:
                for line in pagination_lines:
                    if line.strip():
                        f.write(f"{line}\n")
            result["pagination_lines"] = len(pagination_lines)
        else:
            # No pagination came from extraction; derive it from the text
            pagination_lines = generate_pagination_from_text(result["document_path"],
                                                             result["pagination_path"])
            result["pagination_lines"] = len(pagination_lines)

        # Generate embeddings
        try:
            embedding_chunks = await generate_embeddings_for_file(
                result["document_path"], result["embedding_path"]
            )
            result["embedding_chunks"] = len(embedding_chunks) if embedding_chunks else 0
            result["success"] = True

        except Exception as e:
            result["error"] = f"Embedding generation failed: {str(e)}"
            logger.error(f"Failed to generate embeddings for {filename}: {str(e)}")

    except Exception as e:
        result["error"] = f"File processing failed: {str(e)}"
        logger.error(f"Error processing file {filename}: {str(e)}")

    return result

async def extract_file_content(file_path: str, filename: str) -> Tuple[str, List[str]]:
    """Extract content from various file formats."""

    # Handle zip archives
    if filename.lower().endswith('.zip'):
        return await extract_from_zip(file_path, filename)

    # Handle Excel files
    elif is_excel_file(file_path):
        return await extract_from_excel(file_path, filename)

    # Handle CSV files
    elif is_csv_file(file_path):
        return await extract_from_csv(file_path, filename)

    # Fall back to plain text
    else:
        return await extract_from_text(file_path, filename)

async def extract_from_zip(zip_path: str, filename: str) -> Tuple[str, List[str]]:
    """Extract content from a zip archive."""
    content_parts = []
    pagination_lines = []

    try:
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            # Extract to a temporary directory
            temp_dir = tempfile.mkdtemp(prefix=f"extract_{Path(filename).stem}_")
            try:
                zip_ref.extractall(temp_dir)

                # Process the extracted files
                for root, dirs, files in os.walk(temp_dir):
                    for file in files:
                        if file.lower().endswith(('.txt', '.md', '.xlsx', '.xls', '.csv')):
                            file_path = os.path.join(root, file)
                            try:
                                file_content, file_pagination = await extract_file_content(file_path, file)
                                if file_content:
                                    content_parts.append(f"# Page {file}")
                                    content_parts.append(file_content)
                                    pagination_lines.extend(file_pagination)
                            except Exception as e:
                                logger.error(f"Error processing extracted file {file}: {str(e)}")
            finally:
                # Clean up the temporary directory even if processing fails
                shutil.rmtree(temp_dir, ignore_errors=True)

    except Exception as e:
        logger.error(f"Error extracting zip file {filename}: {str(e)}")
        return "", []

    return '\n\n'.join(content_parts), pagination_lines

async def extract_from_excel(file_path: str, filename: str) -> Tuple[str, List[str]]:
    """Extract content from an Excel file."""
    try:
        document_content, pagination_lines = process_excel_file(file_path)

        if document_content:
            content = f"# Page {filename}\n{document_content}"
            return content, pagination_lines
        else:
            return "", []

    except Exception as e:
        logger.error(f"Error processing Excel file {filename}: {str(e)}")
        return "", []

async def extract_from_csv(file_path: str, filename: str) -> Tuple[str, List[str]]:
    """Extract content from a CSV file."""
    try:
        document_content, pagination_lines = process_csv_file(file_path)

        if document_content:
            content = f"# Page {filename}\n{document_content}"
            return content, pagination_lines
        else:
            return "", []

    except Exception as e:
        logger.error(f"Error processing CSV file {filename}: {str(e)}")
        return "", []

async def extract_from_text(file_path: str, filename: str) -> Tuple[str, List[str]]:
    """Extract content from a plain-text file."""
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read().strip()

        # An empty file yields empty content and no pagination
        return content, []

    except Exception as e:
        logger.error(f"Error reading text file {filename}: {str(e)}")
        return "", []

def generate_pagination_from_text(document_path: str, pagination_path: str) -> List[str]:
    """Generate pagination from a text document."""
    try:
        # Import the embedding module for pagination
        import sys
        sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'embedding'))
        from embedding import split_document_by_pages

        # Writes the pagination file as a side effect
        split_document_by_pages(str(document_path), str(pagination_path))

        # Read back the non-empty pagination lines
        pagination_lines = []
        with open(pagination_path, 'r', encoding='utf-8') as f:
            for line in f:
                if line.strip():
                    pagination_lines.append(line.strip())

        return pagination_lines

    except Exception as e:
        logger.error(f"Error generating pagination from text: {str(e)}")
        return []

async def generate_embeddings_for_file(document_path: str, embedding_path: str) -> Optional[List]:
    """Generate embeddings for a document."""
    try:
        # Import the embedding module
        import sys
        sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'embedding'))
        from embedding import embed_document

        # Generate embeddings using paragraph chunking
        embedding_data = embed_document(
            str(document_path),
            str(embedding_path),
            chunking_strategy='paragraph'
        )

        if embedding_data and 'chunks' in embedding_data:
            return embedding_data['chunks']
        else:
            return None

    except Exception as e:
        logger.error(f"Error generating embeddings: {str(e)}")
        return None

def check_file_already_processed(unique_id: str, group_name: str, filename: str) -> bool:
    """Check if a file has already been processed."""
    filename_stem = Path(filename).stem
    output_dir = os.path.join("projects", "data", unique_id, "processed", group_name, filename_stem)

    document_path = os.path.join(output_dir, "document.txt")
    pagination_path = os.path.join(output_dir, "pagination.txt")
    embedding_path = os.path.join(output_dir, "embedding.pkl")

    # All three outputs must exist and be non-empty
    return all(
        os.path.exists(path) and os.path.getsize(path) > 0
        for path in (document_path, pagination_path, embedding_path)
    )
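

# Minimal usage sketch (illustrative only; the id, group, and paths below
# are hypothetical examples, not part of the module's original surface):
if __name__ == "__main__":
    import asyncio

    async def _demo():
        # Process one remote file end to end and report the outcome
        result = await process_single_file(
            unique_id="demo-0001",        # hypothetical project id
            group_name="reports",         # hypothetical group name
            filename="example.csv",       # hypothetical source file
            original_path="https://example.com/example.csv",
            local_path="/tmp/example.csv",
        )
        print(result["success"], result.get("error"))

    asyncio.run(_demo())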