add csv.py

parent b8e1414a94
commit bd4435c1ec
@@ -16,6 +16,9 @@ from utils.file_utils import (
     load_processed_files_log, save_processed_files_log,
     remove_file_or_directory
 )
+from utils.excel_csv_processor import (
+    is_excel_file, is_csv_file, process_excel_file, process_csv_file
+)


 async def download_dataset_files(unique_id: str, files: Dict[str, List[str]]) -> Dict[str, List[str]]:
@@ -43,13 +46,13 @@ async def download_dataset_files(unique_id: str, files: Dict[str, List[str]]) -> Dict[str, List[str]]:
         with zipfile.ZipFile(zip_path, 'r') as zip_ref:
             zip_ref.extractall(extract_dir)

-        # Find all extracted txt and md files
+        # Find all extracted txt, md, xlsx, xls, and csv files
         for root, dirs, files in os.walk(extract_dir):
             for file in files:
-                if file.lower().endswith(('.txt', '.md')):
+                if file.lower().endswith(('.txt', '.md', '.xlsx', '.xls', '.csv')):
                     extracted_files.append(os.path.join(root, file))

-        print(f"Extracted {len(extracted_files)} txt/md files from {zip_path}")
+        print(f"Extracted {len(extracted_files)} txt/md/xlsx/csv files from {zip_path}")
         return extracted_files

     except Exception as e:
@@ -86,6 +89,7 @@ async def download_dataset_files(unique_id: str, files: Dict[str, List[str]]) -> Dict[str, List[str]]:

             # Read and combine all files for this key
             combined_content = []
+            pagination_lines = []  # Collect pagination lines from all files
             all_processed_files = []

             for file_path in file_list:
@@ -151,12 +155,39 @@ async def download_dataset_files(unique_id: str, files: Dict[str, List[str]]) -> Dict[str, List[str]]:
             # Process all files (extracted from zip or single file)
             for process_file_path in files_to_process:
                 try:
+                    base_filename = os.path.basename(process_file_path)
+
+                    # Check if it's an Excel file
+                    if is_excel_file(process_file_path):
+                        print(f"Processing Excel file: {base_filename}")
+                        document_content, excel_pagination_lines = process_excel_file(process_file_path)
+
+                        if document_content:
+                            combined_content.append(f"# Page {base_filename}")
+                            combined_content.append(document_content)
+
+                        # Collect pagination lines from Excel files
+                        pagination_lines.extend(excel_pagination_lines)
+
+                    # Check if it's a CSV file
+                    elif is_csv_file(process_file_path):
+                        print(f"Processing CSV file: {base_filename}")
+                        document_content, csv_pagination_lines = process_csv_file(process_file_path)
+
+                        if document_content:
+                            combined_content.append(f"# Page {base_filename}")
+                            combined_content.append(document_content)
+
+                        # Collect pagination lines from CSV files
+                        pagination_lines.extend(csv_pagination_lines)
+
+                    # Handle text files (original logic)
+                    else:
                         with open(process_file_path, 'r', encoding='utf-8') as f:
                             content = f.read().strip()

                         if content:
                             # Add file content with page separator
-                            base_filename = os.path.basename(process_file_path)
                             combined_content.append(f"# Page {base_filename}")
                             combined_content.append(content)
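After this hunk, document.txt for a key alternates a `# Page <filename>` marker with each file's converted content. A minimal sketch of the resulting layout, using hypothetical file names:

# Page sales.csv
name: Alice
age: 30
---

name: Bob
age: 25
# Page notes.txt
raw text content of notes.txt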
@@ -186,10 +217,22 @@ async def download_dataset_files(unique_id: str, files: Dict[str, List[str]]) -> Dict[str, List[str]]:
         try:
             import sys
             sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'embedding'))
-            from embedding import split_document_by_pages, embed_document
+            from embedding import embed_document

-            # Generate pagination
-            print(f" Generating pagination for {key}")
+            # Generate pagination file from collected pagination lines
+            # For Excel/CSV files, use the pagination format we collected
+            # For text files, fall back to the original pagination generation
+            if pagination_lines:
+                print(f" Writing pagination data from Excel/CSV files for {key}")
+                with open(pagination_file, 'w', encoding='utf-8') as f:
+                    for line in pagination_lines:
+                        if line.strip():
+                            f.write(f"{line}\n")
+                print(f" Generated {len(pagination_lines)} pagination lines")
+            else:
+                # For text-only files, use the original pagination generation
+                from embedding import split_document_by_pages
+                print(f" Generating pagination from text files for {key}")
                 pages = split_document_by_pages(str(document_file), str(pagination_file))
                 print(f" Generated {len(pages)} pages")

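With collected lines, pagination.txt holds one `key:value;key:value` entry per spreadsheet row (see convert_to_pagination_format in the new file below). With hypothetical columns it might contain:

name:Alice;age:30;city:Berlin
name:Bob;age:25;city:Munich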
utils/excel_csv_processor.py (new file, 253 lines)
@@ -0,0 +1,253 @@
+#!/usr/bin/env python3
+"""
+Excel and CSV file processor for converting data to document.txt and pagination.txt formats.
+"""
+
+import os
+import pandas as pd
+from typing import List, Dict, Any, Tuple
+
+
+def read_excel_sheets(file_path: str) -> Dict[str, List[Dict[str, Any]]]:
+    """
+    Read every sheet of an Excel file.
+
+    Args:
+        file_path: path to the Excel file
+
+    Returns:
+        Dict: sheet name -> list of row dicts for that sheet
+    """
+    try:
+        # Read all sheets
+        excel_file = pd.ExcelFile(file_path)
+        sheets_data = {}
+
+        for sheet_name in excel_file.sheet_names:
+            try:
+                # Read each sheet's data
+                df = pd.read_excel(file_path, sheet_name=sheet_name)
+
+                # Convert to a list of dicts, skipping NaN values
+                sheet_data = []
+                for _, row in df.iterrows():
+                    # Convert NaN values to empty strings
+                    row_dict = {}
+                    for col in df.columns:
+                        value = row[col]
+                        if pd.isna(value):
+                            value = ""
+                        elif isinstance(value, (int, float)):
+                            value = str(value)
+                        else:
+                            value = str(value).strip()
+                        row_dict[str(col)] = value
+
+                    # Only keep non-empty rows
+                    if any(v.strip() for v in row_dict.values()):
+                        sheet_data.append(row_dict)
+
+                sheets_data[sheet_name] = sheet_data
+                print(f"Read Excel sheet '{sheet_name}': {len(sheet_data)} rows")
+
+            except Exception as e:
+                print(f"Failed to read Excel sheet '{sheet_name}': {str(e)}")
+                continue
+
+        return sheets_data
+
+    except Exception as e:
+        print(f"Failed to read Excel file: {str(e)}")
+        return {}
+
+
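A sketch of the return shape, with a hypothetical workbook; every cell is stringified, NaN becomes "", and all-empty rows are dropped:

sheets = read_excel_sheets("report.xlsx")  # hypothetical path
# {'Sheet1': [{'name': 'Alice', 'age': '30'}, {'name': 'Bob', 'age': '25'}]}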
+def read_csv_file(file_path: str, encoding: str = 'utf-8') -> List[Dict[str, Any]]:
+    """
+    Read a CSV file.
+
+    Args:
+        file_path: path to the CSV file
+        encoding: file encoding
+
+    Returns:
+        List: rows of the CSV as a list of dicts
+    """
+    try:
+        # Try several encodings
+        encodings_to_try = [encoding, 'utf-8', 'gbk', 'gb2312', 'utf-8-sig']
+
+        for enc in encodings_to_try:
+            try:
+                df = pd.read_csv(file_path, encoding=enc)
+                break
+            except UnicodeDecodeError:
+                continue
+        else:
+            # If every encoding fails, fall back to UTF-8 and ignore bad bytes
+            # (pd.read_csv takes encoding_errors, not errors)
+            df = pd.read_csv(file_path, encoding='utf-8', encoding_errors='ignore')
+
+        # Convert to a list of dicts, skipping NaN values
+        csv_data = []
+        for _, row in df.iterrows():
+            # Convert NaN values to empty strings
+            row_dict = {}
+            for col in df.columns:
+                value = row[col]
+                if pd.isna(value):
+                    value = ""
+                elif isinstance(value, (int, float)):
+                    value = str(value)
+                else:
+                    value = str(value).strip()
+                row_dict[str(col)] = value
+
+            # Only keep non-empty rows
+            if any(v.strip() for v in row_dict.values()):
+                csv_data.append(row_dict)
+
+        print(f"Read CSV file: {len(csv_data)} rows")
+        return csv_data
+
+    except Exception as e:
+        print(f"Failed to read CSV file: {str(e)}")
+        return []
+
+
+def convert_to_markdown_format(data_list: List[Dict[str, Any]], sheet_name: str = "") -> str:
+    """
+    Convert rows to the expected markdown format.
+
+    Args:
+        data_list: list of row dicts
+        sheet_name: sheet name (optional)
+
+    Returns:
+        str: markdown-formatted text
+    """
+    if not data_list:
+        return ""
+
+    markdown_content = []
+
+    # Add the sheet heading
+    if sheet_name:
+        markdown_content.append(f"# Sheet: {sheet_name}")
+        markdown_content.append("")
+
+    for i, row_data in enumerate(data_list, 1):
+        # Render each row as markdown
+        row_markdown = []
+
+        for key, value in row_data.items():
+            if value and value.strip():  # Only include non-empty values
+                row_markdown.append(f"{key}: {value.strip()}")
+
+        if row_markdown:
+            markdown_content.extend(row_markdown)
+
+            # Add a separator between rows, except after the last one
+            if i < len(data_list):
+                markdown_content.append("---")
+                markdown_content.append("")
+
+    return "\n".join(markdown_content)
+
+
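For two hypothetical rows of a sheet named Sheet1, the joined output reads:

# Sheet: Sheet1

name: Alice
age: 30
---

name: Bob
age: 25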
+def convert_to_pagination_format(data_list: List[Dict[str, Any]]) -> List[str]:
+    """
+    Convert rows to the key:value;key:value format.
+
+    Args:
+        data_list: list of row dicts
+
+    Returns:
+        List: pagination-formatted lines
+    """
+    if not data_list:
+        return []
+
+    pagination_lines = []
+
+    for row_data in data_list:
+        # Render each row in pagination format
+        row_pairs = []
+
+        for key, value in row_data.items():
+            if value and value.strip():  # Only include non-empty values
+                # Strip semicolons and newlines from values to avoid breaking the format
+                clean_value = str(value).replace(';', ',').replace('\n', ' ').strip()
+                if clean_value:
+                    row_pairs.append(f"{key}:{clean_value}")
+
+        if row_pairs:
+            pagination_line = ";".join(row_pairs)
+            pagination_lines.append(pagination_line)
+
+    return pagination_lines
+
+
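A sketch of the conversion on hypothetical rows; note that a semicolon inside a value is rewritten to a comma so it cannot split the line, and empty values are dropped:

rows = [{'name': 'Alice', 'note': 'a;b'}, {'name': 'Bob', 'note': ''}]
convert_to_pagination_format(rows)
# -> ['name:Alice;note:a,b', 'name:Bob']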
+def process_excel_file(file_path: str) -> Tuple[str, List[str]]:
+    """
+    Process an Excel file into document.txt and pagination.txt content.
+
+    Args:
+        file_path: path to the Excel file
+
+    Returns:
+        Tuple: (document_content, pagination_lines)
+    """
+    sheets_data = read_excel_sheets(file_path)
+
+    document_content_parts = []
+    pagination_lines = []
+
+    # Process each sheet
+    for sheet_name, sheet_data in sheets_data.items():
+        if sheet_data:
+            # Build the markdown document content
+            markdown_content = convert_to_markdown_format(sheet_data, sheet_name)
+            if markdown_content:
+                document_content_parts.append(markdown_content)
+
+            # Build the pagination content
+            sheet_pagination_lines = convert_to_pagination_format(sheet_data)
+            pagination_lines.extend(sheet_pagination_lines)
+
+    # Join the document content from all sheets
+    document_content = "\n\n".join(document_content_parts)
+
+    return document_content, pagination_lines
+
+
+def process_csv_file(file_path: str) -> Tuple[str, List[str]]:
+    """
+    Process a CSV file into document.txt and pagination.txt content.
+
+    Args:
+        file_path: path to the CSV file
+
+    Returns:
+        Tuple: (document_content, pagination_lines)
+    """
+    csv_data = read_csv_file(file_path)
+
+    if not csv_data:
+        return "", []
+
+    # Build the markdown document content
+    document_content = convert_to_markdown_format(csv_data)
+
+    # Build the pagination content
+    pagination_lines = convert_to_pagination_format(csv_data)
+
+    return document_content, pagination_lines
+
+
+def is_excel_file(file_path: str) -> bool:
+    """Check whether a file is an Excel file."""
+    return file_path.lower().endswith(('.xlsx', '.xls'))
+
+
+def is_csv_file(file_path: str) -> bool:
+    """Check whether a file is a CSV file."""
+    return file_path.lower().endswith('.csv')
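A minimal usage sketch of the new module; the input path is hypothetical:

from utils.excel_csv_processor import (
    is_excel_file, is_csv_file, process_excel_file, process_csv_file
)

path = "downloads/sales.xlsx"  # hypothetical input
if is_excel_file(path):
    document_content, pagination_lines = process_excel_file(path)
elif is_csv_file(path):
    document_content, pagination_lines = process_csv_file(path)
# document_content feeds document.txt; pagination_lines feed pagination.txt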
@@ -59,13 +59,13 @@ def extract_zip_file(zip_path: str, extract_dir: str) -> List[str]:
         with zipfile.ZipFile(zip_path, 'r') as zip_ref:
             zip_ref.extractall(extract_dir)

-        # Find all extracted txt and md files
+        # Find all extracted txt, md, xlsx, xls, and csv files
         for root, dirs, files in os.walk(extract_dir):
             for file in files:
-                if file.lower().endswith(('.txt', '.md')):
+                if file.lower().endswith(('.txt', '.md', '.xlsx', '.xls', '.csv')):
                     extracted_files.append(os.path.join(root, file))

-        print(f"Extracted {len(extracted_files)} txt/md files from {zip_path}")
+        print(f"Extracted {len(extracted_files)} txt/md/xlsx/csv files from {zip_path}")
         return extracted_files

     except Exception as e: