qwen_agent/mcp/excel_csv_operator_server.py
朱潮 425f3c5bb4 chore: replace Chinese comments and log messages with English
Convert all Chinese comments, docstrings, logger/print output,
HTTPException detail messages, and API response messages to English
across the entire codebase. Functional zh/ja localized strings
(e.g. prompt templates, timezone display names, date formats) are
preserved as-is.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-30 19:45:35 +08:00

504 lines
20 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Excel and CSV file operation MCP server.
Supports reading, searching, and enum-value retrieval for Excel/CSV files.
Follows the implementation style of multi_keyword_search_server.py.
"""
import json
import os
import sys
import asyncio
import re
import chardet
from typing import Any, Dict, List, Optional, Union
import pandas as pd
from mcp_common import (
get_allowed_directory,
load_tools_from_json,
resolve_file_path,
find_file_in_project,
is_regex_pattern,
compile_pattern,
create_error_response,
create_success_response,
create_initialize_response,
create_ping_response,
create_tools_list_response,
handle_mcp_streaming
)
def detect_encoding(file_path: str) -> str:
    """Best-effort detection of a text file's encoding.

    Only the first 10 000 bytes are sampled and handed to ``chardet``.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        The detected encoding name, or ``'utf-8'`` when detection yields
        nothing, when the sample looks like plain ASCII, or when the file
        cannot be read / analysed.
    """
    try:
        with open(file_path, 'rb') as f:
            raw_data = f.read(10000)  # Sample only the head of the file
        encoding = chardet.detect(raw_data)['encoding']
    except Exception:
        # Detection is best-effort: fall back instead of failing the caller.
        # (Unlike a bare ``except:``, this lets KeyboardInterrupt/SystemExit through.)
        return 'utf-8'
    if not encoding or encoding.lower() == 'ascii':
        # An all-ASCII sample says nothing about bytes beyond the sample.
        # UTF-8 decodes every ASCII file identically and also copes with
        # non-ASCII characters that first appear later in the file.
        return 'utf-8'
    return encoding
class ExcelCSVOperator:
    """Core class for Excel and CSV file operations.

    ``load_data``, ``get_sheets`` and ``get_schema`` raise ``ValueError`` on
    failure. ``full_text_search``, ``filter_search`` and ``get_field_enums``
    never raise: they return their result, or a human-readable error message,
    as a string.
    """

    def __init__(self):
        # File extensions accepted by _validate_file
        self.supported_extensions = ['.xlsx', '.xls', '.csv']
        # Not read by any method of this class; kept as part of the instance's attributes
        self.encoding_cache = {}

    def _validate_file(self, file_path: str) -> str:
        """Resolve ``file_path`` and check that its extension is supported.

        Args:
            file_path: Path as supplied by the caller; resolved through
                ``resolve_file_path``.

        Returns:
            The resolved path.

        Raises:
            ValueError: If the resolved path's extension is not one of
                ``self.supported_extensions``.
        """
        resolved_path = resolve_file_path(file_path)
        file_ext = os.path.splitext(resolved_path)[1].lower()
        if file_ext not in self.supported_extensions:
            raise ValueError(f"Unsupported file format: {file_ext}, supported formats: {self.supported_extensions}")
        return resolved_path

    def load_data(self, file_path: str, sheet_name: Optional[str] = None) -> pd.DataFrame:
        """Load data from an Excel or CSV file.

        Args:
            file_path: Path of the file to load.
            sheet_name: Excel sheet to read; the first sheet is read when
                omitted or empty. Ignored for CSV files.

        Returns:
            The loaded DataFrame with missing values replaced by ``''``.

        Raises:
            ValueError: If the file format is unsupported or reading fails.
        """
        file_path = self._validate_file(file_path)
        file_ext = os.path.splitext(file_path)[1].lower()
        try:
            if file_ext == '.csv':
                df = pd.read_csv(file_path, encoding=detect_encoding(file_path))
            elif sheet_name:
                df = pd.read_excel(file_path, sheet_name=sheet_name)
            else:
                # No sheet requested: pandas reads the first sheet
                df = pd.read_excel(file_path)
            # Handle empty values
            return df.fillna('')
        except Exception as e:
            raise ValueError(f"File loading failed: {str(e)}") from e

    def get_sheets(self, file_path: str) -> List[str]:
        """Get all sheet names from an Excel file.

        Returns:
            The workbook's sheet names, or ``['default']`` for a CSV file.

        Raises:
            ValueError: If the file format is unsupported or the workbook
                cannot be opened.
        """
        file_path = self._validate_file(file_path)
        file_ext = os.path.splitext(file_path)[1].lower()
        if file_ext == '.csv':
            return ['default']  # A CSV file has only one default sheet
        try:
            # Context manager closes the workbook handle once the names are read
            with pd.ExcelFile(file_path) as excel_file:
                return excel_file.sheet_names
        except Exception as e:
            raise ValueError(f"Failed to read Excel sheet list: {str(e)}") from e

    def get_schema(self, file_path: str, sheet_name: Optional[str] = None) -> List[str]:
        """Get the list of column names for the file (or the given sheet).

        Raises:
            ValueError: If the data cannot be loaded.
        """
        try:
            df = self.load_data(file_path, sheet_name)
            return df.columns.tolist()
        except Exception as e:
            raise ValueError(f"Failed to get schema: {str(e)}") from e

    def full_text_search(self, file_path: str, keywords: List[str],
                         top_k: int = 10, case_sensitive: bool = False) -> str:
        """Search every row of the file for the given keywords / patterns.

        For Excel files all sheets are searched. Rows are ranked by the number
        of distinct keywords they match and the best ``top_k`` are returned.

        Args:
            file_path: File to search.
            keywords: Keywords or patterns; each is passed through
                ``compile_pattern`` and entries for which it returns ``None``
                are ignored. A single bare string is treated as one keyword.
            top_k: Maximum number of rows to return.
            case_sensitive: Whether matching is case-sensitive.

        Returns:
            CSV-style text with the header
            ``sheet,row_index,match_count,matched_content,match_details``,
            or a message string when nothing matches or an error occurs.
        """
        if not keywords:
            return "Error: Keyword list cannot be empty"
        if isinstance(keywords, str):
            # Iterating a bare string would search for each character separately
            keywords = [keywords]

        # Preprocess and validate regex expressions in keywords
        valid_keywords = []
        regex_errors = []
        for keyword in keywords:
            if compile_pattern(keyword) is None:
                regex_errors.append(keyword)
            else:
                valid_keywords.append(keyword)
        if regex_errors:
            error_msg = f"Warning: The following regular expressions failed to compile and will be ignored: {', '.join(regex_errors)}"
            # NOTE(review): sent to stderr on the assumption that stdout carries
            # the MCP protocol stream — confirm against handle_mcp_streaming.
            print(error_msg, file=sys.stderr)
        if not valid_keywords:
            return "Error: No valid search keywords"

        try:
            validated_path = self._validate_file(file_path)
            file_ext = os.path.splitext(validated_path)[1].lower()
            all_results = []
            if file_ext == '.csv':
                # A CSV file has only one dataset
                all_results.extend(
                    self._search_in_file(validated_path, valid_keywords, case_sensitive, 'default'))
            else:
                # For Excel files, search all sheets
                for sheet in self.get_sheets(validated_path):
                    all_results.extend(
                        self._search_in_file(validated_path, valid_keywords, case_sensitive, sheet))

            # Sort by match count in descending order, then keep the top_k rows
            all_results.sort(key=lambda x: x['match_count'], reverse=True)
            limited_results = all_results[:top_k]
            if not limited_results:
                return "No matching results found"

            # Build CSV-formatted output.
            # NOTE(review): commas are stripped from the data rather than quoted,
            # which is lossy; kept as-is so the output format stays unchanged.
            headers = ["sheet", "row_index", "match_count", "matched_content", "match_details"]
            csv_lines = [",".join(headers)]
            for result in limited_results:
                sheet = str(result.get('sheet', '')).replace(',', '')
                row_index = str(result.get('row_index', ''))
                match_count = str(result.get('match_count', 0))
                matched_content = str(result.get('matched_content', '')).replace(',', '').replace('\n', ' ')
                match_details = str(result.get('match_details', '')).replace(',', '')
                csv_lines.append(f"{sheet},{row_index},{match_count},{matched_content},{match_details}")
            return "\n".join(csv_lines)
        except Exception as e:
            return f"Search failed: {str(e)}"

    def _search_in_file(self, file_path: str, keywords: List[str],
                        case_sensitive: bool, sheet_name: Optional[str] = None) -> List[Dict[str, Any]]:
        """Search one dataset (a CSV file or a single Excel sheet) for keywords.

        Returns:
            One dict per matching row with the keys ``sheet``, ``row_index``,
            ``match_count``, ``matched_content`` and ``match_details``. On any
            error a diagnostic is printed and the results gathered so far
            (possibly none) are returned.
        """
        results = []
        try:
            df = self.load_data(file_path, sheet_name)

            # Prepare every matcher once, before the row loop:
            # (original keyword, pattern, is_regex, lower-cased/plain needle)
            matchers = []
            for keyword in keywords:
                compiled = compile_pattern(keyword)
                if compiled is None:
                    continue
                is_regex = isinstance(compiled, re.Pattern)
                needle = None
                if is_regex:
                    if not case_sensitive:
                        # Recompile with IGNORECASE so matches are still taken
                        # from the original-case row text
                        compiled = re.compile(compiled.pattern, compiled.flags | re.IGNORECASE)
                else:
                    needle = compiled if case_sensitive else compiled.lower()
                matchers.append((keyword, compiled, is_regex, needle))

            # Search row by row
            for row_index, row in df.iterrows():
                # Combine the non-blank cells of the row into one string
                row_content = " ".join(str(cell) for cell in row.values if str(cell).strip())
                search_content = row_content if case_sensitive else row_content.lower()

                detail_parts = []
                for original, pattern, is_regex, needle in matchers:
                    if is_regex:
                        match = pattern.search(row_content)
                        if match:
                            detail_parts.append(f"[regex:{original}={match.group(0)}]")
                    elif needle in search_content:
                        detail_parts.append(f"[keyword:{pattern}]")

                if detail_parts:
                    results.append({
                        'sheet': sheet_name,
                        'row_index': row_index,
                        'match_count': len(detail_parts),
                        'matched_content': row_content,
                        'match_details': " ".join(detail_parts)
                    })
        except Exception as e:
            # NOTE(review): stderr, assuming stdout is reserved for the MCP
            # protocol stream — confirm against handle_mcp_streaming.
            print(f"Error searching file {file_path} (sheet: {sheet_name}): {str(e)}", file=sys.stderr)
        return results

    def filter_search(self, file_path: str, filters: Dict,
                      sheet_name: Optional[str] = None) -> str:
        """Filter rows by per-field conditions; all conditions must hold.

        Args:
            file_path: File to query.
            filters: Mapping of field name to a condition dict with the keys
                ``operator`` (``eq`` by default; one of ``eq``, ``gt``, ``lt``,
                ``gte``, ``lte``, ``contains``, ``regex``) and ``value``.
            sheet_name: Excel sheet to read (first sheet when omitted).

        Returns:
            The matching rows as CSV text, or a message string when nothing
            matches, a field/operator is invalid, or an error occurs.
        """
        if not filters:
            return "Error: Filter conditions cannot be empty"
        try:
            df = self.load_data(file_path, sheet_name)
            # Boolean indexing below always yields a new frame, so df is not mutated
            filtered_df = df
            for field_name, filter_condition in filters.items():
                if field_name not in df.columns:
                    return f"Error: Field '{field_name}' does not exist"
                # Named op_name so it does not shadow the module-level ``operator`` instance
                op_name = filter_condition.get('operator', 'eq')
                value = filter_condition.get('value')
                column = filtered_df[field_name]
                if op_name == 'eq':
                    mask = column == value
                elif op_name == 'gt':
                    mask = column > value
                elif op_name == 'lt':
                    mask = column < value
                elif op_name == 'gte':
                    mask = column >= value
                elif op_name == 'lte':
                    mask = column <= value
                elif op_name == 'contains':
                    # Literal substring test; regex=False keeps characters such
                    # as '(' or '.' from being interpreted as a pattern
                    mask = column.astype(str).str.contains(str(value), na=False, regex=False)
                elif op_name == 'regex':
                    try:
                        pattern = re.compile(str(value))
                    except re.error as e:
                        return f"Error: Regular expression '{value}' compilation failed: {str(e)}"
                    # str.match anchors the pattern at the start of each value
                    mask = column.astype(str).str.match(pattern, na=False)
                else:
                    return f"Error: Unsupported operator '{op_name}'"
                filtered_df = filtered_df[mask]

            if filtered_df.empty:
                return "No records matching conditions found"
            # Convert to a CSV string
            return filtered_df.to_csv(index=False, encoding='utf-8')
        except Exception as e:
            return f"Filter search failed: {str(e)}"

    def get_field_enums(self, file_path: str, field_names: List[str],
                        sheet_name: Optional[str] = None, max_enum_count: int = 100,
                        min_occurrence: int = 1) -> str:
        """Get the distinct values (with occurrence counts) of the given fields.

        Args:
            file_path: File to query.
            field_names: Columns to summarise.
            sheet_name: Excel sheet to read (first sheet when omitted).
            max_enum_count: Maximum number of values listed per field.
            min_occurrence: Values occurring fewer times than this are dropped.

        Returns:
            One line per field of the form
            ``field: [value(count), ...] (total: ..., after filtering: ..., total rows: ...)``,
            or a message string when a field is missing or an error occurs.
        """
        if not field_names:
            return "Error: Field name list cannot be empty"
        try:
            df = self.load_data(file_path, sheet_name)
            # Validate that fields exist
            missing_fields = [field for field in field_names if field not in df.columns]
            if missing_fields:
                return f"Error: Fields do not exist: {', '.join(missing_fields)}"

            output_lines = []
            total_rows = len(df)
            for field in field_names:
                value_counts = df[field].value_counts()
                # Drop rare values, then cap the number of listed values
                filtered_counts = value_counts[value_counts >= min_occurrence]
                top_values = filtered_counts.head(max_enum_count)
                enum_str = ", ".join(f"{value}({count})" for value, count in top_values.items())
                output_lines.append(
                    f"{field}: [{enum_str}] (total: {len(value_counts)} unique values, after filtering: {len(filtered_counts)}, total rows: {total_rows})"
                )
            return "\n".join(output_lines)
        except Exception as e:
            return f"Failed to get enum values: {str(e)}"
# Global operator instance: a single ExcelCSVOperator created at import time
# and used by handle_request for every tool call.
operator = ExcelCSVOperator()
async def handle_request(request: Dict[str, Any]) -> Dict[str, Any]:
    """Dispatch a single MCP request and build its response.

    Handles the methods ``initialize``, ``ping``, ``tools/list`` and
    ``tools/call``. An unknown method or tool name yields an error response
    with code -32601; any exception raised while handling the request yields
    an error response with code -32603.
    """
    try:
        method = request.get("method")
        params = request.get("params", {})
        request_id = request.get("id")

        def text_response(text: str) -> Dict[str, Any]:
            # Wrap a text payload in the success envelope shared by all tools
            return create_success_response(request_id, {
                "content": [
                    {
                        "type": "text",
                        "text": text
                    }
                ]
            })

        if method == "initialize":
            return create_initialize_response(request_id, "excel-csv-operator")
        if method == "ping":
            return create_ping_response(request_id)
        if method == "tools/list":
            # Tool definitions live in a JSON file next to the server
            tool_definitions = load_tools_from_json("excel_csv_operator_tools.json")
            return create_tools_list_response(request_id, tool_definitions)
        if method != "tools/call":
            return create_error_response(request_id, -32601, f"Unknown method: {method}")

        tool_name = params.get("name")
        arguments = params.get("arguments", {})

        # Arguments are read inside each branch only, so an unknown tool name
        # is reported as such regardless of what "arguments" holds.
        if tool_name == "get_excel_sheets":
            sheet_names = operator.get_sheets(arguments.get("file_path"))
            return text_response(json.dumps(sheet_names, ensure_ascii=False, indent=2))

        if tool_name == "get_table_schema":
            columns = operator.get_schema(
                arguments.get("file_path"),
                arguments.get("sheet_name"),
            )
            return text_response(json.dumps(columns, ensure_ascii=False, indent=2))

        if tool_name == "full_text_search":
            return text_response(operator.full_text_search(
                arguments.get("file_path"),
                arguments.get("keywords", []),
                arguments.get("top_k", 10),
                arguments.get("case_sensitive", False),
            ))

        if tool_name == "filter_search":
            target_path = arguments.get("file_path")
            target_sheet = arguments.get("sheet_name")
            conditions = arguments.get("filters")
            return text_response(operator.filter_search(target_path, conditions, target_sheet))

        if tool_name == "get_field_enums":
            target_path = arguments.get("file_path")
            target_sheet = arguments.get("sheet_name")
            fields = arguments.get("field_names", [])
            enum_limit = arguments.get("max_enum_count", 100)
            occurrence_floor = arguments.get("min_occurrence", 1)
            return text_response(operator.get_field_enums(
                target_path, fields, target_sheet, enum_limit, occurrence_floor))

        return create_error_response(request_id, -32601, f"Unknown tool: {tool_name}")
    except Exception as e:
        return create_error_response(request.get("id"), -32603, f"Internal error: {str(e)}")
async def main():
    """Main entry point: pass ``handle_request`` to ``handle_mcp_streaming`` and await it.

    NOTE(review): ``handle_mcp_streaming`` is imported from ``mcp_common`` and
    is not visible here; presumably it runs the request/response loop and
    calls ``handle_request`` for each incoming message — confirm there.
    """
    await handle_mcp_streaming(handle_request)
# Start the server only when this file is executed as a script, not on import.
if __name__ == "__main__":
    asyncio.run(main())