qwen_agent/utils/excel_csv_processor.py
朱潮 425f3c5bb4 chore: replace Chinese comments and log messages with English
Convert all Chinese comments, docstrings, logger/print output,
HTTPException detail messages, and API response messages to English
across the entire codebase. Functional zh/ja localized strings
(e.g. prompt templates, timezone display names, date formats) are
preserved as-is.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-30 19:45:35 +08:00

258 lines
7.6 KiB
Python

#!/usr/bin/env python3
"""
Excel and CSV file processor for converting data to document.txt and pagination.txt formats.
"""
import os
import pandas as pd
import logging
from typing import List, Dict, Any, Tuple
# Module-level logger used by every function in this file; it is looked up
# under the fixed name 'app' rather than this module's own __name__.
logger = logging.getLogger('app')
def read_excel_sheets(file_path: str) -> Dict[str, List[Dict[str, Any]]]:
    """
    Read all sheets from an Excel file.

    Every cell is normalised to a string: missing values (NaN/None/NaT)
    become "", everything else becomes ``str(value).strip()``. Rows whose
    cells are all empty are dropped. A sheet that fails to load is logged
    and skipped; if the workbook itself cannot be opened, the error is
    logged and an empty dict is returned.

    Args:
        file_path: Path to the Excel file

    Returns:
        Dict: Keys are sheet names, values are lists of row data dictionaries
    """
    try:
        # The context manager closes the workbook handle even if a sheet
        # fails to parse, and the same handle is reused for every sheet so
        # the file is opened only once.
        with pd.ExcelFile(file_path) as excel_file:
            sheets_data = {}
            for sheet_name in excel_file.sheet_names:
                try:
                    df = pd.read_excel(excel_file, sheet_name=sheet_name)
                    columns = [str(col) for col in df.columns]

                    sheet_data = []
                    # itertuples() keeps each column's own dtype. iterrows()
                    # coerces a whole row to one dtype, which turns integer
                    # cells into "1.0" as soon as the sheet has a float column.
                    for values in df.itertuples(index=False, name=None):
                        row_dict = {
                            col: "" if pd.isna(value) else str(value).strip()
                            for col, value in zip(columns, values)
                        }
                        # Only add non-empty rows (values are already stripped)
                        if any(row_dict.values()):
                            sheet_data.append(row_dict)

                    sheets_data[sheet_name] = sheet_data
                    logger.info(f"Read Excel sheet '{sheet_name}': {len(sheet_data)} rows of data")
                except Exception as e:
                    # Best effort: one broken sheet must not discard the others
                    logger.error(f"Failed to read Excel sheet '{sheet_name}': {str(e)}")
                    continue
            return sheets_data
    except Exception as e:
        logger.error(f"Failed to read Excel file: {str(e)}")
        return {}
def read_csv_file(file_path: str, encoding: str = 'utf-8') -> List[Dict[str, Any]]:
    """
    Read a CSV file.

    The requested encoding is tried first, followed by a fixed list of
    fallbacks. If every candidate raises ``UnicodeDecodeError`` the file is
    decoded as UTF-8 with undecodable bytes dropped. Every cell is
    normalised to a string: missing values become "", everything else
    becomes ``str(value).strip()``. Rows whose cells are all empty are
    dropped. Any other failure is logged and an empty list is returned.

    Args:
        file_path: Path to the CSV file
        encoding: File encoding to try first

    Returns:
        List: CSV data as a list of dictionaries
    """
    try:
        # dict.fromkeys() removes duplicates (the default 'utf-8' would
        # otherwise be tried twice) while keeping the priority order.
        encodings_to_try = list(
            dict.fromkeys([encoding, 'utf-8', 'gbk', 'gb2312', 'utf-8-sig'])
        )
        for enc in encodings_to_try:
            try:
                df = pd.read_csv(file_path, encoding=enc)
                break
            except UnicodeDecodeError:
                continue
        else:
            # Last resort: decode as UTF-8 and drop undecodable bytes.
            # read_csv's keyword for this is `encoding_errors`; passing
            # `errors=` raises TypeError, so this fallback never worked.
            df = pd.read_csv(file_path, encoding='utf-8', encoding_errors='ignore')

        columns = [str(col) for col in df.columns]
        csv_data = []
        # itertuples() keeps each column's own dtype. iterrows() coerces a
        # whole row to one dtype, which turns integer cells into "1.0" as
        # soon as the file has a float column.
        for values in df.itertuples(index=False, name=None):
            row_dict = {
                col: "" if pd.isna(value) else str(value).strip()
                for col, value in zip(columns, values)
            }
            # Only add non-empty rows (values are already stripped)
            if any(row_dict.values()):
                csv_data.append(row_dict)

        logger.info(f"Read CSV file: {len(csv_data)} rows of data")
        return csv_data
    except Exception as e:
        logger.error(f"Failed to read CSV file: {str(e)}")
        return []
def convert_to_markdown_format(data_list: List[Dict[str, Any]], sheet_name: str = "") -> str:
    """
    Convert data to markdown format.

    Each row becomes a run of ``key: value`` lines. Consecutive rows are
    separated by a ``---`` line followed by a blank line. Values are
    converted with ``str()`` and stripped, so non-string values are
    accepted; ``None`` and values that are blank after stripping are
    omitted. Rows with nothing to show are skipped entirely and never
    produce a separator.

    Args:
        data_list: List of data dictionaries
        sheet_name: Sheet name (optional); when given, a "# Sheet: ..."
            heading and a blank line are emitted first

    Returns:
        str: Markdown formatted text ("" when data_list is empty)
    """
    if not data_list:
        return ""

    # Render every row first so separators can be placed between rows that
    # actually produced output, not between positions in the input list.
    row_blocks = []
    for row_data in data_list:
        row_lines = []
        for key, value in row_data.items():
            text = "" if value is None else str(value).strip()
            if text:  # Only include non-empty values
                row_lines.append(f"{key}: {text}")
        if row_lines:
            row_blocks.append(row_lines)

    markdown_content = []
    # Add sheet title
    if sheet_name:
        markdown_content.append(f"# Sheet: {sheet_name}")
        markdown_content.append("")

    last_index = len(row_blocks) - 1
    for index, row_lines in enumerate(row_blocks):
        markdown_content.extend(row_lines)
        # Separator between rows, except after the last rendered row
        if index < last_index:
            markdown_content.append("---")
            markdown_content.append("")

    return "\n".join(markdown_content)
def convert_to_pagination_format(data_list: List[Dict[str, Any]]) -> List[str]:
    """
    Convert data to key:value;key:value format.

    One output line is produced per row. Because ';' separates pairs and
    each row must stay on a single line, ';' is replaced by ',' and line
    breaks (both '\\n' and '\\r') are replaced by spaces in keys as well as
    in values. Values are converted with ``str()``, so non-string values
    are accepted; ``None`` and values that are blank after cleaning are
    omitted, and rows with no remaining pairs are skipped.

    Args:
        data_list: List of data dictionaries

    Returns:
        List: Pagination formatted text lines
    """
    if not data_list:
        return []

    # Characters that would break the "k:v;k:v" one-line-per-row layout
    unsafe_chars = str.maketrans({';': ',', '\n': ' ', '\r': ' '})

    pagination_lines = []
    for row_data in data_list:
        row_pairs = []
        for key, value in row_data.items():
            if value is None:
                continue
            clean_value = str(value).translate(unsafe_chars).strip()
            if clean_value:  # Only include non-empty values
                # Header cells can contain line breaks or ';' too
                clean_key = str(key).translate(unsafe_chars)
                row_pairs.append(f"{clean_key}:{clean_value}")
        if row_pairs:
            pagination_lines.append(";".join(row_pairs))

    return pagination_lines
def process_excel_file(file_path: str) -> Tuple[str, List[str]]:
    """
    Build document.txt and pagination.txt content from an Excel workbook.

    Sheets are handled in the order returned by read_excel_sheets; sheets
    without any rows contribute nothing. The markdown sections of the
    individual sheets are joined with a blank line between them, and the
    pagination lines of all sheets are concatenated into one list.

    Args:
        file_path: Path to the Excel file

    Returns:
        Tuple: (document_content, pagination_lines)
    """
    doc_sections: List[str] = []
    all_pagination: List[str] = []

    for name, rows in read_excel_sheets(file_path).items():
        if not rows:
            continue

        # Markdown section for this sheet (headed by the sheet name)
        section = convert_to_markdown_format(rows, name)
        if section:
            doc_sections.append(section)

        # One pagination line per row of this sheet
        all_pagination.extend(convert_to_pagination_format(rows))

    return "\n\n".join(doc_sections), all_pagination
def process_csv_file(file_path: str) -> Tuple[str, List[str]]:
    """
    Build document.txt and pagination.txt content from a CSV file.

    Args:
        file_path: Path to the CSV file

    Returns:
        Tuple: (document_content, pagination_lines); ("", []) when the file
        yields no rows
    """
    rows = read_csv_file(file_path)
    if rows:
        # Markdown document first, then the one-line-per-row pagination form
        markdown_text = convert_to_markdown_format(rows)
        line_items = convert_to_pagination_format(rows)
        return markdown_text, line_items
    return "", []
def is_excel_file(file_path: str) -> bool:
    """Return True when the path ends in .xlsx or .xls (case-insensitive)."""
    lowered = file_path.lower()
    return lowered.endswith('.xlsx') or lowered.endswith('.xls')
def is_csv_file(file_path: str) -> bool:
    """Return True when the path ends in .csv (case-insensitive)."""
    suffix = '.csv'
    return file_path.lower()[-len(suffix):] == suffix