qwen_agent/utils/single_file_processor.py
#!/usr/bin/env python3
"""
Single file processing functions for handling individual files.
"""
import logging
import os
import shutil
import tempfile
import zipfile
from pathlib import Path
from typing import Dict, List, Optional, Tuple

from utils.file_utils import download_file

# Configure logger
logger = logging.getLogger('app')

# Try to import the Excel/CSV processor, but handle missing dependencies
try:
    from utils.excel_csv_processor import (
        is_excel_file, is_csv_file, process_excel_file, process_csv_file
    )
    EXCEL_CSV_SUPPORT = True
except ImportError as e:
    logger.warning(f"Excel/CSV processing not available: {e}")
    EXCEL_CSV_SUPPORT = False

    # Fallback functions: detect file types by extension, return empty content
    def is_excel_file(file_path):
        return file_path.lower().endswith(('.xlsx', '.xls'))

    def is_csv_file(file_path):
        return file_path.lower().endswith('.csv')

    def process_excel_file(file_path):
        return "", []

    def process_csv_file(file_path):
        return "", []


async def process_single_file(
    unique_id: str,
    group_name: str,
    filename: str,
    original_path: str,
    local_path: str
) -> Dict:
    """
    Process a single file and generate document.txt, pagination.txt, and embedding.pkl.

    Returns:
        Dict with processing results and file paths.
    """
    # Create output directory for this file
    filename_stem = Path(filename).stem
    output_dir = os.path.join("projects", "data", unique_id, "processed", group_name, filename_stem)
    os.makedirs(output_dir, exist_ok=True)

    result = {
        "success": False,
        "filename": filename,
        "group": group_name,
        "output_dir": output_dir,
        "document_path": os.path.join(output_dir, "document.txt"),
        "pagination_path": os.path.join(output_dir, "pagination.txt"),
        "embedding_path": os.path.join(output_dir, "embedding.pkl"),
        "error": None,
        "content_size": 0,
        "pagination_lines": 0,
        "embedding_chunks": 0
    }
    try:
        # Download the file if it is remote and not yet downloaded
        if original_path.startswith(('http://', 'https://')):
            if not os.path.exists(local_path):
                logger.info(f"Downloading {original_path} -> {local_path}")
                success = await download_file(original_path, local_path)
                if not success:
                    result["error"] = "Failed to download file"
                    return result

        # Extract content from the file
        content, pagination_lines = await extract_file_content(local_path, filename)
        if not content or not content.strip():
            result["error"] = "No content extracted from file"
            return result

        # Write document.txt
        with open(result["document_path"], 'w', encoding='utf-8') as f:
            f.write(content)
        result["content_size"] = len(content)

        # Write pagination.txt
        if pagination_lines:
            with open(result["pagination_path"], 'w', encoding='utf-8') as f:
                for line in pagination_lines:
                    if line.strip():
                        f.write(f"{line}\n")
            result["pagination_lines"] = len(pagination_lines)
        else:
            # Generate pagination from the text content
            pagination_lines = generate_pagination_from_text(
                result["document_path"], result["pagination_path"]
            )
            result["pagination_lines"] = len(pagination_lines)

        # Generate embeddings
        try:
            embedding_chunks = await generate_embeddings_for_file(
                result["document_path"], result["embedding_path"]
            )
            result["embedding_chunks"] = len(embedding_chunks) if embedding_chunks else 0
            result["success"] = True
        except Exception as e:
            result["error"] = f"Embedding generation failed: {str(e)}"
            logger.error(f"Failed to generate embeddings for {filename}: {str(e)}")
    except Exception as e:
        result["error"] = f"File processing failed: {str(e)}"
        logger.error(f"Error processing file {filename}: {str(e)}")

    return result


async def extract_file_content(file_path: str, filename: str) -> Tuple[str, List[str]]:
    """Extract content from various file formats."""
    # Handle zip archives
    if filename.lower().endswith('.zip'):
        return await extract_from_zip(file_path, filename)
    # Handle Excel files
    elif is_excel_file(file_path):
        return await extract_from_excel(file_path, filename)
    # Handle CSV files
    elif is_csv_file(file_path):
        return await extract_from_csv(file_path, filename)
    # Everything else is treated as plain text
    else:
        return await extract_from_text(file_path, filename)


async def extract_from_zip(zip_path: str, filename: str) -> Tuple[str, List[str]]:
    """Extract content from a zip archive by processing each supported file inside it."""
    content_parts = []
    pagination_lines = []
    temp_dir = None
    try:
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            # Extract to a temporary directory
            temp_dir = tempfile.mkdtemp(prefix=f"extract_{Path(filename).stem}_")
            zip_ref.extractall(temp_dir)

        # Process the extracted files
        for root, dirs, files in os.walk(temp_dir):
            for file in files:
                if file.lower().endswith(('.txt', '.md', '.xlsx', '.xls', '.csv')):
                    file_path = os.path.join(root, file)
                    try:
                        file_content, file_pagination = await extract_file_content(file_path, file)
                        if file_content:
                            content_parts.append(f"# Page {file}")
                            content_parts.append(file_content)
                            pagination_lines.extend(file_pagination)
                    except Exception as e:
                        logger.error(f"Error processing extracted file {file}: {str(e)}")
    except Exception as e:
        logger.error(f"Error extracting zip file {filename}: {str(e)}")
        return "", []
    finally:
        # Clean up the temporary directory even when extraction fails
        if temp_dir and os.path.isdir(temp_dir):
            shutil.rmtree(temp_dir)

    return '\n\n'.join(content_parts), pagination_lines


async def extract_from_excel(file_path: str, filename: str) -> Tuple[str, List[str]]:
    """Extract content from an Excel file."""
    try:
        document_content, pagination_lines = process_excel_file(file_path)
        if document_content:
            content = f"# Page {filename}\n{document_content}"
            return content, pagination_lines
        else:
            return "", []
    except Exception as e:
        logger.error(f"Error processing Excel file {filename}: {str(e)}")
        return "", []


async def extract_from_csv(file_path: str, filename: str) -> Tuple[str, List[str]]:
    """Extract content from a CSV file."""
    try:
        document_content, pagination_lines = process_csv_file(file_path)
        if document_content:
            content = f"# Page {filename}\n{document_content}"
            return content, pagination_lines
        else:
            return "", []
    except Exception as e:
        logger.error(f"Error processing CSV file {filename}: {str(e)}")
        return "", []


async def extract_from_text(file_path: str, filename: str) -> Tuple[str, List[str]]:
    """Extract content from a plain-text file."""
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read().strip()
        if content:
            return content, []
        else:
            return "", []
    except Exception as e:
        logger.error(f"Error reading text file {filename}: {str(e)}")
        return "", []


def generate_pagination_from_text(document_path: str, pagination_path: str) -> List[str]:
    """Generate pagination for a text document and return the pagination lines."""
    try:
        # Import the embedding module for pagination
        import sys
        sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'embedding'))
        from embedding import split_document_by_pages

        # split_document_by_pages writes the pagination file as a side effect
        split_document_by_pages(str(document_path), str(pagination_path))

        # Read back the pagination lines
        pagination_lines = []
        with open(pagination_path, 'r', encoding='utf-8') as f:
            for line in f:
                if line.strip():
                    pagination_lines.append(line.strip())
        return pagination_lines
    except Exception as e:
        logger.error(f"Error generating pagination from text: {str(e)}")
        return []


async def generate_embeddings_for_file(document_path: str, embedding_path: str) -> Optional[List]:
    """Generate embeddings for a document and return the embedded chunks."""
    try:
        # Import the embedding module
        import sys
        sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'embedding'))
        from embedding import embed_document

        # Generate embeddings using paragraph chunking
        embedding_data = embed_document(
            str(document_path),
            str(embedding_path),
            chunking_strategy='paragraph'
        )
        if embedding_data and 'chunks' in embedding_data:
            return embedding_data['chunks']
        else:
            return None
    except Exception as e:
        logger.error(f"Error generating embeddings: {str(e)}")
        return None


def check_file_already_processed(unique_id: str, group_name: str, filename: str) -> bool:
    """Check whether a file has already been processed (all outputs exist and are non-empty)."""
    filename_stem = Path(filename).stem
    output_dir = os.path.join("projects", "data", unique_id, "processed", group_name, filename_stem)
    document_path = os.path.join(output_dir, "document.txt")
    pagination_path = os.path.join(output_dir, "pagination.txt")
    embedding_path = os.path.join(output_dir, "embedding.pkl")

    # All three output files must exist and be non-empty
    if (os.path.exists(document_path) and os.path.exists(pagination_path) and
            os.path.exists(embedding_path)):
        if (os.path.getsize(document_path) > 0 and os.path.getsize(pagination_path) > 0 and
                os.path.getsize(embedding_path) > 0):
            return True
    return False
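

# A minimal smoke-test sketch, not part of the module's public API: the file
# names and IDs below are illustrative assumptions, and the script must run
# from the directory containing "projects/" for the output paths to resolve.
if __name__ == "__main__":
    import asyncio

    # Process a single local text file (no download, since the path is not a
    # URL) and print the result summary dict.
    demo_result = asyncio.run(process_single_file(
        unique_id="demo-id",          # hypothetical job identifier
        group_name="default",         # hypothetical group name
        filename="example.txt",       # hypothetical input file
        original_path="example.txt",  # local path, so the download step is skipped
        local_path="example.txt",
    ))
    print(demo_result)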