日志优化 (Logging optimization)
This commit is contained in:
parent 116f1cb471
commit 3973174c83
README.md (17)
@@ -668,3 +668,20 @@ qwen-agent/
```

现在您可以开始使用 Catalog Agent 进行智能数据检索了!🚀

## 提示词工程

指南

```guideline
1) Condition: 用户意图为“执行设备控制”。 Action: 如果用户在当前交互中已回复肯定确认(例如,同意、确认、好的等),则立即调用 Iot Control-dxcore_update_device_status 工具执行设备操作。否则(用户尚未确认),则向用户发送确认询问:“即将为您[操作内容][设备名称][具体参数],是否确认?”并等待用户回复。
3) Condition: 当用户意图为查询设备状态或信息时。 Action: 调用 Iot Control 设备相关的工具进行查询(不调用 rag_retrieve-rag_retrieve)。
4) Condition: 当用户意图为非设备相关的问题(如提问、遇到难处、咨询事实性问题、价格等)时。 Action: 优先调用 rag_retrieve-rag_retrieve 查询知识库;如果无结果,则调用 WebSearch-web_search;最后综合两个工具的内容进行回复。
```

术语表

```terms
1) Name: term_name1, Description: desc, Synonyms: syn1, syn2
2) Name: term_name2, Description: desc, Synonyms: syn1, syn2
```

@@ -7,7 +7,9 @@ import asyncio
import os
import json
from typing import Dict, Tuple, Optional, Any
from utils.logger import logger
import logging

logger = logging.getLogger('app')


class ConfigFileCache:
@@ -97,4 +99,4 @@ class ConfigFileCache:

# 全局缓存实例
config_cache = ConfigFileCache()
config_cache = ConfigFileCache()

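The hunk above sets the pattern repeated throughout this commit: the per-module or project-specific logger import is replaced by `import logging` plus a shared `logger = logging.getLogger('app')`, and later hunks swap `print()` calls for `logger.info/warning/error/debug`. The actual handler wiring lives in `utils.log_util.logger.init_with_fastapi` (not shown in this diff); the following is only a minimal sketch, under that assumption, of what a central configuration for the shared `'app'` logger might look like:

```python
import logging
import sys


def configure_app_logger(level: int = logging.INFO) -> logging.Logger:
    """Hypothetical central setup for the shared 'app' logger.

    Once configured here, every module can simply call
    logging.getLogger('app') and reuse the same handler/formatter.
    """
    logger = logging.getLogger('app')
    logger.setLevel(level)
    if not logger.handlers:  # avoid stacking duplicate handlers on re-import
        handler = logging.StreamHandler(sys.stdout)
        handler.setFormatter(logging.Formatter(
            "%(asctime)s %(levelname)s [%(name)s] %(message)s"))
        logger.addHandler(handler)
    return logger


# Per-module usage, matching the pattern the commit introduces:
logger = logging.getLogger('app')
logger.info("系统优化初始化完成")  # replaces the old print(...)
```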
@@ -24,7 +24,9 @@ from typing import Dict, Optional, Union

from dotenv import load_dotenv

from qwen_agent.log import logger
import logging

logger = logging.getLogger('app')
from qwen_agent.tools.base import BaseTool

@@ -21,7 +21,9 @@ import asyncio
from typing import Dict, List, Optional

from qwen_agent.agents import Assistant
from qwen_agent.log import logger
import logging

logger = logging.getLogger('app')

from agent.modified_assistant import init_modified_agent_service_with_files, update_agent_llm
from agent.prompt_loader import load_system_prompt_async, load_mcp_settings_async

@@ -24,9 +24,11 @@ from qwen_agent.llm.schema import ASSISTANT, FUNCTION, Message
from qwen_agent.llm.oai import TextChatAtOAI
from qwen_agent.tools import BaseTool
from agent.custom_mcp_manager import CustomMCPManager
import logging

logger = logging.getLogger('app')
# 设置工具日志记录器
tool_logger = logging.getLogger('tool_logger')
tool_logger = logging.getLogger('app')

class ModifiedAssistant(Assistant):
    """

@@ -7,6 +7,9 @@ import json
import asyncio
from typing import List, Dict, Optional, Any
from datetime import datetime, timezone, timedelta
import logging

logger = logging.getLogger('app')


def safe_replace(text: str, placeholder: str, value: Any) -> str:

@ -142,9 +145,9 @@ async def load_system_prompt_async(project_dir: str, language: str = None, syste
|
||||
default_prompt_file = os.path.join("prompt", f"system_prompt_{robot_type}.md")
|
||||
system_prompt_default = await config_cache.get_text_file(default_prompt_file)
|
||||
if system_prompt_default:
|
||||
print(f"Using cached default system prompt for {robot_type} from prompt folder")
|
||||
logger.info(f"Using cached default system prompt for {robot_type} from prompt folder")
|
||||
except Exception as e:
|
||||
print(f"Failed to load default system prompt for {robot_type}: {str(e)}")
|
||||
logger.error(f"Failed to load default system prompt for {robot_type}: {str(e)}")
|
||||
system_prompt_default = None
|
||||
|
||||
readme = ""
|
||||
@ -244,11 +247,11 @@ async def load_mcp_settings_async(project_dir: str, mcp_settings: list=None, bot
|
||||
default_mcp_file = os.path.join("mcp", f"mcp_settings_{robot_type}.json")
|
||||
default_mcp_settings = await config_cache.get_json_file(default_mcp_file) or []
|
||||
if default_mcp_settings:
|
||||
print(f"Using cached default mcp_settings_{robot_type} from mcp folder")
|
||||
logger.info(f"Using cached default mcp_settings_{robot_type} from mcp folder")
|
||||
else:
|
||||
print(f"No default mcp_settings_{robot_type} found, using empty default settings")
|
||||
logger.warning(f"No default mcp_settings_{robot_type} found, using empty default settings")
|
||||
except Exception as e:
|
||||
print(f"Failed to load default mcp_settings_{robot_type}: {str(e)}")
|
||||
logger.error(f"Failed to load default mcp_settings_{robot_type}: {str(e)}")
|
||||
default_mcp_settings = []
|
||||
|
||||
# 遍历mcpServers工具,给每个工具增加env参数
|
||||
@ -269,7 +272,7 @@ async def load_mcp_settings_async(project_dir: str, mcp_settings: list=None, bot
|
||||
input_mcp_settings = mcp_settings
|
||||
elif mcp_settings:
|
||||
input_mcp_settings = [mcp_settings]
|
||||
print(f"Warning: mcp_settings is not a list, converting to list format")
|
||||
logger.warning(f"Warning: mcp_settings is not a list, converting to list format")
|
||||
|
||||
# 3. 合并默认设置和传入设置
|
||||
merged_settings = []
|
||||
@ -286,7 +289,7 @@ async def load_mcp_settings_async(project_dir: str, mcp_settings: list=None, bot
|
||||
# 合并mcpServers对象,传入的设置覆盖默认设置中相同的key
|
||||
default_mcp_servers.update(input_mcp_servers)
|
||||
merged_settings[0]['mcpServers'] = default_mcp_servers
|
||||
print(f"Merged mcpServers: default + {len(input_mcp_servers)} input servers")
|
||||
logger.info(f"Merged mcpServers: default + {len(input_mcp_servers)} input servers")
|
||||
|
||||
# 如果没有默认设置但有传入设置,直接使用传入设置
|
||||
elif input_mcp_settings:
|
||||
@ -296,7 +299,7 @@ async def load_mcp_settings_async(project_dir: str, mcp_settings: list=None, bot
|
||||
if not merged_settings:
|
||||
merged_settings = []
|
||||
elif not isinstance(merged_settings, list):
|
||||
print(f"Warning: merged_settings is not a list, converting to list format")
|
||||
logger.warning(f"Warning: merged_settings is not a list, converting to list format")
|
||||
merged_settings = [merged_settings] if merged_settings else []
|
||||
|
||||
# 计算 dataset_dir 用于替换 MCP 配置中的占位符
|
||||
|
||||
@ -24,7 +24,9 @@ import threading
|
||||
from collections import defaultdict
|
||||
|
||||
from qwen_agent.agents import Assistant
|
||||
from qwen_agent.log import logger
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger('app')
|
||||
|
||||
from agent.modified_assistant import init_modified_agent_service_with_files, update_agent_llm
|
||||
from agent.prompt_loader import load_system_prompt_async, load_mcp_settings_async
|
||||
|
||||
@ -7,6 +7,10 @@ import requests
|
||||
import asyncio
|
||||
import hashlib
|
||||
import json
|
||||
import logging
|
||||
|
||||
# Configure logger
|
||||
logger = logging.getLogger('app')
|
||||
|
||||
def encode_texts_via_api(texts, batch_size=32):
|
||||
"""通过 API 接口编码文本"""
|
||||
@ -35,18 +39,18 @@ def encode_texts_via_api(texts, batch_size=32):
|
||||
|
||||
if result_data.get("success"):
|
||||
embeddings_list = result_data.get("embeddings", [])
|
||||
print(f"API编码成功,处理了 {len(texts)} 个文本,embedding维度: {len(embeddings_list[0]) if embeddings_list else 0}")
|
||||
logger.info(f"API编码成功,处理了 {len(texts)} 个文本,embedding维度: {len(embeddings_list[0]) if embeddings_list else 0}")
|
||||
return np.array(embeddings_list)
|
||||
else:
|
||||
error_msg = result_data.get('error', '未知错误')
|
||||
print(f"API编码失败: {error_msg}")
|
||||
logger.error(f"API编码失败: {error_msg}")
|
||||
raise Exception(f"API编码失败: {error_msg}")
|
||||
else:
|
||||
print(f"API请求失败: {response.status_code} - {response.text}")
|
||||
logger.error(f"API请求失败: {response.status_code} - {response.text}")
|
||||
raise Exception(f"API请求失败: {response.status_code}")
|
||||
|
||||
except Exception as e:
|
||||
print(f"API编码异常: {e}")
|
||||
logger.error(f"API编码异常: {e}")
|
||||
raise
|
||||
|
||||
def clean_text(text):
|
||||
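The hunks above show `encode_texts_via_api` logging the outcome of a POST to an internal embedding service that answers with `{"success": ..., "embeddings": [...], "error": ...}`. As a rough sketch only — the endpoint URL, timeout, and batching details are assumptions, since the real configuration is not part of this diff:

```python
import logging

import numpy as np
import requests

logger = logging.getLogger('app')


def encode_texts_via_api_sketch(texts, batch_size=32,
                                api_url="http://localhost:8001/api/encode"):  # placeholder URL, not the project's real endpoint
    """Illustrative API-based encoder; the response field names follow the diff above."""
    embeddings = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        response = requests.post(api_url, json={"texts": batch}, timeout=60)
        if response.status_code != 200:
            logger.error(f"API请求失败: {response.status_code} - {response.text}")
            raise Exception(f"API请求失败: {response.status_code}")
        result_data = response.json()
        if not result_data.get("success"):
            error_msg = result_data.get("error", "未知错误")
            logger.error(f"API编码失败: {error_msg}")
            raise Exception(f"API编码失败: {error_msg}")
        embeddings.extend(result_data.get("embeddings", []))
    logger.info(f"API编码成功,处理了 {len(texts)} 个文本")
    return np.array(embeddings)
```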
@ -144,10 +148,10 @@ def embed_document(input_file='document.txt', output_file='embedding.pkl',
|
||||
if is_meaningful_line(cleaned_text):
|
||||
chunks.append(cleaned_text)
|
||||
|
||||
print(f"使用按行分块策略")
|
||||
print(f"原始行数: {original_count}")
|
||||
print(f"清理后有效句子数: {len(chunks)}")
|
||||
print(f"过滤比例: {((original_count - len(chunks)) / original_count * 100):.1f}%")
|
||||
logger.info(f"使用按行分块策略")
|
||||
logger.info(f"原始行数: {original_count}")
|
||||
logger.info(f"清理后有效句子数: {len(chunks)}")
|
||||
logger.info(f"过滤比例: {((original_count - len(chunks)) / original_count * 100):.1f}%")
|
||||
|
||||
elif chunking_strategy == 'paragraph':
|
||||
# 新的段落级分块策略
|
||||
@ -166,13 +170,13 @@ def embed_document(input_file='document.txt', output_file='embedding.pkl',
|
||||
# 使用段落分块
|
||||
chunks = paragraph_chunking(cleaned_content, **params)
|
||||
|
||||
print(f"使用段落级分块策略")
|
||||
print(f"文档总长度: {len(content)} 字符")
|
||||
print(f"分块数量: {len(chunks)}")
|
||||
logger.info(f"使用段落级分块策略")
|
||||
logger.info(f"文档总长度: {len(content)} 字符")
|
||||
logger.info(f"分块数量: {len(chunks)}")
|
||||
if chunks:
|
||||
print(f"平均chunk大小: {sum(len(chunk) for chunk in chunks) / len(chunks):.1f} 字符")
|
||||
print(f"最大chunk大小: {max(len(chunk) for chunk in chunks)} 字符")
|
||||
print(f"最小chunk大小: {min(len(chunk) for chunk in chunks)} 字符")
|
||||
logger.debug(f"平均chunk大小: {sum(len(chunk) for chunk in chunks) / len(chunks):.1f} 字符")
|
||||
logger.debug(f"最大chunk大小: {max(len(chunk) for chunk in chunks)} 字符")
|
||||
logger.debug(f"最小chunk大小: {min(len(chunk) for chunk in chunks)} 字符")
|
||||
|
||||
elif chunking_strategy == 'smart':
|
||||
# 智能分块策略,自动检测文档格式
|
||||
@ -186,25 +190,25 @@ def embed_document(input_file='document.txt', output_file='embedding.pkl',
|
||||
# 使用智能分块
|
||||
chunks = smart_chunking(content, **params)
|
||||
|
||||
print(f"使用智能分块策略")
|
||||
print(f"文档总长度: {len(content)} 字符")
|
||||
print(f"分块数量: {len(chunks)}")
|
||||
logger.info(f"使用智能分块策略")
|
||||
logger.info(f"文档总长度: {len(content)} 字符")
|
||||
logger.info(f"分块数量: {len(chunks)}")
|
||||
if chunks:
|
||||
print(f"平均chunk大小: {sum(len(chunk) for chunk in chunks) / len(chunks):.1f} 字符")
|
||||
print(f"最大chunk大小: {max(len(chunk) for chunk in chunks)} 字符")
|
||||
print(f"最小chunk大小: {min(len(chunk) for chunk in chunks)} 字符")
|
||||
logger.debug(f"平均chunk大小: {sum(len(chunk) for chunk in chunks) / len(chunks):.1f} 字符")
|
||||
logger.debug(f"最大chunk大小: {max(len(chunk) for chunk in chunks)} 字符")
|
||||
logger.debug(f"最小chunk大小: {min(len(chunk) for chunk in chunks)} 字符")
|
||||
|
||||
else:
|
||||
raise ValueError(f"不支持的分块策略: {chunking_strategy}")
|
||||
|
||||
if not chunks:
|
||||
print("警告:没有找到有效的内容块!")
|
||||
logger.warning("警告:没有找到有效的内容块!")
|
||||
return None
|
||||
|
||||
print(f"正在处理 {len(chunks)} 个内容块...")
|
||||
logger.info(f"正在处理 {len(chunks)} 个内容块...")
|
||||
|
||||
# 使用API接口进行编码
|
||||
print("使用API接口进行编码...")
|
||||
logger.info("使用API接口进行编码...")
|
||||
chunk_embeddings = encode_texts_via_api(chunks, batch_size=32)
|
||||
|
||||
embedding_data = {
|
||||
@ -218,14 +222,14 @@ def embed_document(input_file='document.txt', output_file='embedding.pkl',
|
||||
with open(output_file, 'wb') as f:
|
||||
pickle.dump(embedding_data, f)
|
||||
|
||||
print(f"已保存嵌入向量到 {output_file}")
|
||||
logger.info(f"已保存嵌入向量到 {output_file}")
|
||||
return embedding_data
|
||||
|
||||
except FileNotFoundError:
|
||||
print(f"错误:找不到文件 {input_file}")
|
||||
logger.error(f"错误:找不到文件 {input_file}")
|
||||
return None
|
||||
except Exception as e:
|
||||
print(f"处理文档时出错:{e}")
|
||||
logger.error(f"处理文档时出错:{e}")
|
||||
return None
|
||||
|
||||
|
||||
@ -611,21 +615,21 @@ def split_document_by_pages(input_file='document.txt', output_file='pagination.t
|
||||
if page_content:
|
||||
pages.append(page_content)
|
||||
|
||||
print(f"总共分割出 {len(pages)} 页")
|
||||
logger.info(f"总共分割出 {len(pages)} 页")
|
||||
|
||||
# 写入序列化文件
|
||||
with open(output_file, 'w', encoding='utf-8') as f:
|
||||
for i, page_content in enumerate(pages, 1):
|
||||
f.write(f"{page_content}\n")
|
||||
|
||||
print(f"已将页面内容序列化到 {output_file}")
|
||||
logger.info(f"已将页面内容序列化到 {output_file}")
|
||||
return pages
|
||||
|
||||
except FileNotFoundError:
|
||||
print(f"错误:找不到文件 {input_file}")
|
||||
logger.error(f"错误:找不到文件 {input_file}")
|
||||
return []
|
||||
except Exception as e:
|
||||
print(f"分割文档时出错:{e}")
|
||||
logger.error(f"分割文档时出错:{e}")
|
||||
return []
|
||||
|
||||
def test_chunking_strategies():
|
||||
@ -645,54 +649,54 @@ def test_chunking_strategies():
|
||||
第五段:这是最后一个段落。它用来测试分块策略的完整性和准确性。
|
||||
"""
|
||||
|
||||
print("=" * 60)
|
||||
print("分块策略测试")
|
||||
print("=" * 60)
|
||||
logger.debug("=" * 60)
|
||||
logger.debug("分块策略测试")
|
||||
logger.debug("=" * 60)
|
||||
|
||||
# 测试1: 段落级分块(小chunk)
|
||||
print("\n1. 段落级分块 - 小chunk (max_size=200):")
|
||||
logger.debug("\n1. 段落级分块 - 小chunk (max_size=200):")
|
||||
chunks_small = paragraph_chunking(test_text, max_chunk_size=200, overlap=50)
|
||||
for i, chunk in enumerate(chunks_small):
|
||||
print(f"Chunk {i+1} (长度: {len(chunk)}): {chunk[:50]}...")
|
||||
logger.debug(f"Chunk {i+1} (长度: {len(chunk)}): {chunk[:50]}...")
|
||||
|
||||
# 测试2: 段落级分块(大chunk)
|
||||
print("\n2. 段落级分块 - 大chunk (max_size=500):")
|
||||
logger.debug("\n2. 段落级分块 - 大chunk (max_size=500):")
|
||||
chunks_large = paragraph_chunking(test_text, max_chunk_size=500, overlap=100)
|
||||
for i, chunk in enumerate(chunks_large):
|
||||
print(f"Chunk {i+1} (长度: {len(chunk)}): {chunk[:50]}...")
|
||||
logger.debug(f"Chunk {i+1} (长度: {len(chunk)}): {chunk[:50]}...")
|
||||
|
||||
# 测试3: 段落级分块(无重叠)
|
||||
print("\n3. 段落级分块 - 无重叠:")
|
||||
logger.debug("\n3. 段落级分块 - 无重叠:")
|
||||
chunks_no_overlap = paragraph_chunking(test_text, max_chunk_size=300, overlap=0)
|
||||
for i, chunk in enumerate(chunks_no_overlap):
|
||||
print(f"Chunk {i+1} (长度: {len(chunk)}): {chunk[:50]}...")
|
||||
logger.debug(f"Chunk {i+1} (长度: {len(chunk)}): {chunk[:50]}...")
|
||||
|
||||
print(f"\n测试总结:")
|
||||
print(f"- 小chunk策略: {len(chunks_small)} 个chunks")
|
||||
print(f"- 大chunk策略: {len(chunks_large)} 个chunks")
|
||||
print(f"- 无重叠策略: {len(chunks_no_overlap)} 个chunks")
|
||||
logger.debug(f"\n测试总结:")
|
||||
logger.debug(f"- 小chunk策略: {len(chunks_small)} 个chunks")
|
||||
logger.debug(f"- 大chunk策略: {len(chunks_large)} 个chunks")
|
||||
logger.debug(f"- 无重叠策略: {len(chunks_no_overlap)} 个chunks")
|
||||
|
||||
|
||||
def demo_usage():
|
||||
"""
|
||||
演示如何使用新的分块功能
|
||||
"""
|
||||
print("=" * 60)
|
||||
print("使用示例")
|
||||
print("=" * 60)
|
||||
logger.debug("=" * 60)
|
||||
logger.debug("使用示例")
|
||||
logger.debug("=" * 60)
|
||||
|
||||
print("\n1. 使用传统的按行分块:")
|
||||
print("embed_document('document.txt', 'line_embedding.pkl', chunking_strategy='line')")
|
||||
logger.debug("\n1. 使用传统的按行分块:")
|
||||
logger.debug("embed_document('document.txt', 'line_embedding.pkl', chunking_strategy='line')")
|
||||
|
||||
print("\n2. 使用段落级分块(默认参数):")
|
||||
print("embed_document('document.txt', 'paragraph_embedding.pkl', chunking_strategy='paragraph')")
|
||||
logger.debug("\n2. 使用段落级分块(默认参数):")
|
||||
logger.debug("embed_document('document.txt', 'paragraph_embedding.pkl', chunking_strategy='paragraph')")
|
||||
|
||||
print("\n3. 使用自定义参数的段落级分块:")
|
||||
print("embed_document('document.txt', 'custom_embedding.pkl',")
|
||||
print(" chunking_strategy='paragraph',")
|
||||
print(" max_chunk_size=1500,")
|
||||
print(" overlap=200,")
|
||||
print(" min_chunk_size=300)")
|
||||
logger.debug("\n3. 使用自定义参数的段落级分块:")
|
||||
logger.debug("embed_document('document.txt', 'custom_embedding.pkl',")
|
||||
logger.debug(" chunking_strategy='paragraph',")
|
||||
logger.debug(" max_chunk_size=1500,")
|
||||
logger.debug(" overlap=200,")
|
||||
logger.debug(" min_chunk_size=300)")
|
||||
|
||||
|
||||
|
||||
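The hunks above reference three chunking strategies (`line`, `paragraph`, `smart`) and the parameters `max_chunk_size`, `overlap`, and `min_chunk_size`, but the project's `paragraph_chunking` implementation itself is not part of this diff. A simplified sketch of what overlap-based paragraph chunking typically looks like, offered as an assumption rather than the actual code:

```python
def paragraph_chunking_sketch(text: str, max_chunk_size: int = 1000,
                              overlap: int = 100, min_chunk_size: int = 200):
    """Greedily pack paragraphs into chunks up to max_chunk_size characters,
    carrying the tail of each chunk into the next one as overlap."""
    paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()]
    chunks, current = [], ""
    for para in paragraphs:
        if current and len(current) + len(para) + 1 > max_chunk_size:
            chunks.append(current)
            # keep the last `overlap` characters so context continues across chunks
            current = current[-overlap:] if overlap else ""
        current = (current + "\n" + para).strip() if current else para
    if current:
        if chunks and len(current) < min_chunk_size:
            chunks[-1] += "\n" + current  # fold a tiny trailing chunk into the previous one
        else:
            chunks.append(current)
    return chunks


# 用法示例(参数与上面 demo_usage 中展示的一致):
# chunks = paragraph_chunking_sketch(content, max_chunk_size=1500, overlap=200, min_chunk_size=300)
```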
@ -737,10 +741,10 @@ def cache_terms_embeddings(bot_id: str, terms_list: List[Dict[str, Any]]) -> Dic
|
||||
# 验证缓存数据是否匹配当前的terms
|
||||
current_hash = _generate_terms_hash(terms_list)
|
||||
if cached_data.get('hash') == current_hash:
|
||||
print(f"Using cached terms embeddings for {cache_key}")
|
||||
logger.info(f"Using cached terms embeddings for {cache_key}")
|
||||
return cached_data
|
||||
except Exception as e:
|
||||
print(f"Error loading cache: {e}")
|
||||
logger.error(f"Error loading cache: {e}")
|
||||
|
||||
# 准备要编码的文本
|
||||
term_texts = []
|
||||
@ -793,11 +797,11 @@ def cache_terms_embeddings(bot_id: str, terms_list: List[Dict[str, Any]]) -> Dic
|
||||
with open(cache_file, 'wb') as f:
|
||||
pickle.dump(cache_data, f)
|
||||
|
||||
print(f"Cached {len(term_texts)} terms embeddings to {cache_file}")
|
||||
logger.info(f"Cached {len(term_texts)} terms embeddings to {cache_file}")
|
||||
return cache_data
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error generating terms embeddings: {e}")
|
||||
logger.error(f"Error generating terms embeddings: {e}")
|
||||
return {}
|
||||
|
||||
|
||||
@ -826,14 +830,14 @@ def search_similar_terms(query_text: str, cached_terms_data: Dict[str, Any]) ->
|
||||
term_info = cached_terms_data['term_info']
|
||||
|
||||
# 添加调试信息
|
||||
print(f"DEBUG: Query text: '{query_text}'")
|
||||
print(f"DEBUG: Query vector shape: {query_vector.shape}, norm: {np.linalg.norm(query_vector)}")
|
||||
logger.debug(f"DEBUG: Query text: '{query_text}'")
|
||||
logger.debug(f"DEBUG: Query vector shape: {query_vector.shape}, norm: {np.linalg.norm(query_vector)}")
|
||||
|
||||
# 计算cos相似度
|
||||
similarities = _cosine_similarity(query_vector, term_embeddings)
|
||||
|
||||
print(f"DEBUG: Similarities: {similarities}")
|
||||
print(f"DEBUG: Max similarity: {np.max(similarities):.3f}, Mean similarity: {np.mean(similarities):.3f}")
|
||||
logger.debug(f"DEBUG: Similarities: {similarities}")
|
||||
logger.debug(f"DEBUG: Max similarity: {np.max(similarities):.3f}, Mean similarity: {np.mean(similarities):.3f}")
|
||||
|
||||
# 获取所有terms的相似度
|
||||
matches = []
|
||||
@ -852,7 +856,7 @@ def search_similar_terms(query_text: str, cached_terms_data: Dict[str, Any]) ->
|
||||
return matches[:5]
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error in similarity search: {e}")
|
||||
logger.error(f"Error in similarity search: {e}")
|
||||
return []
|
||||
|
||||
|
||||
|
||||
@ -18,8 +18,9 @@ import psutil
|
||||
import numpy as np
|
||||
|
||||
from sentence_transformers import SentenceTransformer
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
logger = logging.getLogger('app')
|
||||
|
||||
|
||||
class GlobalModelManager:
|
||||
|
||||
@ -10,7 +10,11 @@ from fastapi import FastAPI
|
||||
from fastapi.staticfiles import StaticFiles
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from routes.file_manager import router as file_manager_router
|
||||
from utils.logger import logger
|
||||
import logging
|
||||
|
||||
|
||||
from utils.log_util.logger import init_with_fastapi
|
||||
|
||||
|
||||
# Import route modules
|
||||
from routes import chat, files, projects, system
|
||||
@ -20,6 +24,10 @@ from routes.system import agent_manager, connection_pool, file_cache
|
||||
|
||||
app = FastAPI(title="Database Assistant API", version="1.0.0")
|
||||
|
||||
init_with_fastapi(app)
|
||||
|
||||
logger = logging.getLogger('app')
|
||||
|
||||
# 挂载public文件夹为静态文件服务
|
||||
app.mount("/public", StaticFiles(directory="public"), name="static")
|
||||
|
||||
@ -47,7 +55,7 @@ app.include_router(file_manager_router)
|
||||
|
||||
if __name__ == "__main__":
|
||||
# 启动 FastAPI 应用
|
||||
print("Starting FastAPI server...")
|
||||
print("File Manager API available at: http://localhost:8001/api/v1/files")
|
||||
print("Web Interface available at: http://localhost:8001/public/file-manager.html")
|
||||
logger.info("Starting FastAPI server...")
|
||||
logger.info("File Manager API available at: http://localhost:8001/api/v1/files")
|
||||
logger.info("Web Interface available at: http://localhost:8001/public/file-manager.html")
|
||||
uvicorn.run(app, host="0.0.0.0", port=8001)
|
||||
|
||||
@ -5,7 +5,9 @@ from typing import Union, Optional
|
||||
from fastapi import APIRouter, HTTPException, Header
|
||||
from fastapi.responses import StreamingResponse
|
||||
from pydantic import BaseModel
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger('app')
|
||||
from utils import (
|
||||
Message, ChatRequest, ChatResponse
|
||||
)
|
||||
@ -79,6 +81,7 @@ async def generate_stream_response(agent, messages, thought_list, tool_response:
|
||||
}
|
||||
yield f"data: {json.dumps(chunk_data, ensure_ascii=False)}\n\n"
|
||||
|
||||
logger.info(f"开始生成Agent流式响应, model: {model}")
|
||||
for response in agent.run(messages=messages):
|
||||
previous_content = accumulated_content
|
||||
accumulated_content = get_content_from_messages(response, tool_response=tool_response)
|
||||
@ -92,6 +95,8 @@ async def generate_stream_response(agent, messages, thought_list, tool_response:
|
||||
|
||||
# 只有当有新内容时才发送chunk
|
||||
if new_content:
|
||||
if chunk_id == 0:
|
||||
logger.info(f"Agent首个Token已生成, 开始流式输出")
|
||||
chunk_id += 1
|
||||
# 构造OpenAI格式的流式响应
|
||||
chunk_data = {
|
||||
@ -126,10 +131,11 @@ async def generate_stream_response(agent, messages, thought_list, tool_response:
|
||||
|
||||
# 发送结束标记
|
||||
yield "data: [DONE]\n\n"
|
||||
logger.info(f"Agent流式响应完成, 总共生成 {chunk_id} 个chunks")
|
||||
except Exception as e:
|
||||
import traceback
|
||||
error_details = traceback.format_exc()
|
||||
from utils.logger import logger
|
||||
|
||||
logger.error(f"Error in generate_stream_response: {str(e)}")
|
||||
logger.error(f"Full traceback: {error_details}")
|
||||
|
||||
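`generate_stream_response` above accumulates the agent's output, diffs it against what was already sent, and yields only the new suffix as OpenAI-style SSE chunks before a final `data: [DONE]` marker. A stripped-down sketch of that delta loop; the chunk schema is abbreviated, and `extract_text` stands in for the project's `get_content_from_messages`:

```python
import json
import logging

logger = logging.getLogger('app')


async def stream_deltas_sketch(agent, messages, extract_text, model="qwen"):
    """Yield only newly generated text as SSE chunks, then a [DONE] marker."""
    accumulated_content = ""
    chunk_id = 0
    for response in agent.run(messages=messages):
        previous_content = accumulated_content
        accumulated_content = extract_text(response)
        new_content = accumulated_content[len(previous_content):]
        if not new_content:
            continue  # only send a chunk when there is new content
        if chunk_id == 0:
            logger.info("Agent首个Token已生成, 开始流式输出")
        chunk_id += 1
        chunk_data = {
            "id": f"chatcmpl-{chunk_id}",
            "object": "chat.completion.chunk",
            "model": model,
            "choices": [{"index": 0, "delta": {"content": new_content}}],
        }
        yield f"data: {json.dumps(chunk_data, ensure_ascii=False)}\n\n"
    yield "data: [DONE]\n\n"
    logger.info(f"Agent流式响应完成, 总共生成 {chunk_id} 个chunks")
```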
@ -168,7 +174,7 @@ async def create_agent_and_generate_response(
|
||||
# 2. 如果有terms内容,先进行embedding(embedding需要缓存起来,这个可以tmp文件缓存,以{bot_id}_terms作为key)embedding实现情参考 @embedding/embedding.py 文件,可以在里面实现。拿到embedding后,可以进行相似性检索,检索方式先使用cos相似度,找到阈值相似性>0.7的匹配项,重新整理为terms_analysis,格式:1) Name: term_name1, Description: desc, Synonyms: syn1, syn2。
|
||||
terms_analysis = ""
|
||||
if terms_list:
|
||||
print(f"terms_list: {terms_list}")
|
||||
logger.info(f"terms_list: {terms_list}")
|
||||
# 从messages中提取用户的查询文本用于相似性检索
|
||||
query_text = get_user_last_message_content(messages)
|
||||
# 使用embedding进行terms处理
|
||||
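The comment above spells out the terms workflow: embeddings are cached per bot under a `{bot_id}_terms` key, the user query is matched against them by cosine similarity with a 0.7 threshold, and the matches are rendered as `1) Name: ..., Description: ..., Synonyms: ...`. A minimal sketch of that matching step — the helper names are illustrative, not the project's actual functions:

```python
import numpy as np


def cosine_similarities(query_vector: np.ndarray, term_embeddings: np.ndarray) -> np.ndarray:
    """Cosine similarity between one query vector and a matrix of term vectors."""
    query_norm = query_vector / (np.linalg.norm(query_vector) + 1e-12)
    term_norms = term_embeddings / (np.linalg.norm(term_embeddings, axis=1, keepdims=True) + 1e-12)
    return term_norms @ query_norm


def format_matching_terms(query_vector, term_embeddings, term_info, threshold=0.7):
    """Keep terms above the similarity threshold and render them in the README's terms format."""
    sims = cosine_similarities(query_vector, term_embeddings)
    lines = []
    for idx in np.argsort(sims)[::-1]:  # highest similarity first
        if sims[idx] < threshold:
            break
        info = term_info[idx]  # e.g. {"name": ..., "description": ..., "synonyms": [...]}
        lines.append(f"{len(lines) + 1}) Name: {info['name']}, "
                     f"Description: {info['description']}, "
                     f"Synonyms: {', '.join(info['synonyms'])}")
    return "\n".join(lines)
```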
@ -178,9 +184,9 @@ async def create_agent_and_generate_response(
|
||||
if terms_analysis:
|
||||
# 将terms分析结果也添加到消息中
|
||||
system_prompt = system_prompt.replace("{terms}", terms_analysis)
|
||||
print(f"Generated terms analysis: {terms_analysis[:200]}...") # 只打印前200个字符
|
||||
logger.info(f"Generated terms analysis: {terms_analysis[:200]}...") # 只打印前200个字符
|
||||
except Exception as e:
|
||||
print(f"Error processing terms with embedding: {e}")
|
||||
logger.error(f"Error processing terms with embedding: {e}")
|
||||
terms_analysis = ""
|
||||
else:
|
||||
# 当terms_list为空时,删除对应的pkl缓存文件
|
||||
@ -189,14 +195,14 @@ async def create_agent_and_generate_response(
|
||||
cache_file = f"projects/cache/{bot_id}_terms.pkl"
|
||||
if os.path.exists(cache_file):
|
||||
os.remove(cache_file)
|
||||
print(f"Removed empty terms cache file: {cache_file}")
|
||||
logger.info(f"Removed empty terms cache file: {cache_file}")
|
||||
except Exception as e:
|
||||
print(f"Error removing terms cache file: {e}")
|
||||
logger.error(f"Error removing terms cache file: {e}")
|
||||
|
||||
# 3. 如果有guideline内容,进行并发处理
|
||||
guideline_analysis = ""
|
||||
if guidelines_list:
|
||||
print(f"guidelines_list: {guidelines_list}")
|
||||
logger.info(f"guidelines_list: {guidelines_list}")
|
||||
guidelines_count = len(guidelines_list)
|
||||
|
||||
if guidelines_count > 0:
|
||||
@ -223,7 +229,7 @@ async def create_agent_and_generate_response(
|
||||
batches[-2].extend(batches[-1])
|
||||
batches.pop()
|
||||
|
||||
print(f"Processing {guidelines_count} guidelines in {len(batches)} batches with {batch_count} concurrent batches")
|
||||
logger.info(f"Processing {guidelines_count} guidelines in {len(batches)} batches with {batch_count} concurrent batches")
|
||||
|
||||
# 准备chat_history
|
||||
chat_history = format_messages_to_chat_history(messages)
|
||||
@ -265,13 +271,13 @@ async def create_agent_and_generate_response(
|
||||
# 处理结果:最后一个结果是agent,前面的是guideline批次结果
|
||||
agent = all_results[-1] # agent创建的结果
|
||||
batch_results = all_results[:-1] # guideline批次的结果
|
||||
print(f"batch_results:{batch_results}")
|
||||
logger.info(f"batch_results:{batch_results}")
|
||||
|
||||
# 合并guideline分析结果,使用JSON格式的checks数组
|
||||
all_checks = []
|
||||
for i, result in enumerate(batch_results):
|
||||
if isinstance(result, Exception):
|
||||
print(f"Guideline batch {i} failed: {result}")
|
||||
logger.error(f"Guideline batch {i} failed: {result}")
|
||||
continue
|
||||
if result and isinstance(result, dict) and 'checks' in result:
|
||||
# 如果是JSON对象且包含checks数组,只保留applies为true的checks
|
||||
@ -279,13 +285,13 @@ async def create_agent_and_generate_response(
|
||||
all_checks.extend(applicable_checks)
|
||||
elif result and isinstance(result, str) and result.strip():
|
||||
# 如果是普通文本,保留原有逻辑
|
||||
print(f"Non-JSON result from batch {i}: {result}")
|
||||
logger.info(f"Non-JSON result from batch {i}: {result}")
|
||||
|
||||
if all_checks:
|
||||
# 将checks数组格式化为JSON字符串
|
||||
guideline_analysis = "\n".join([item["rationale"] for item in all_checks])
|
||||
# guideline_analysis = json.dumps({"checks": all_checks}, ensure_ascii=False)
|
||||
print(f"Merged guideline analysis result: {guideline_analysis}")
|
||||
logger.info(f"Merged guideline analysis result: {guideline_analysis}")
|
||||
|
||||
# 将分析结果添加到最后一个消息的内容中
|
||||
if guideline_analysis:
|
||||
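The hunks above split the guidelines into batches, evaluate them concurrently alongside agent creation, and keep only checks whose `applies` flag is true, joining their `rationale` fields. A condensed sketch of that orchestration with `asyncio.gather`; the per-batch evaluator and agent factory are stand-ins, not the project's real functions:

```python
import asyncio
import logging

logger = logging.getLogger('app')


async def run_guideline_batches_sketch(guidelines, batch_count, evaluate_batch, create_agent):
    """evaluate_batch(batch) -> {"checks": [...]} and create_agent() -> agent are stand-in coroutines."""
    batch_size = max(1, len(guidelines) // batch_count)
    batches = [guidelines[i:i + batch_size] for i in range(0, len(guidelines), batch_size)]
    if len(batches) > batch_count:
        batches[-2].extend(batches[-1])  # fold the remainder into the last full batch
        batches.pop()
    logger.info(f"Processing {len(guidelines)} guidelines in {len(batches)} batches")

    # Run every guideline batch and the agent creation concurrently; the agent is the last result.
    all_results = await asyncio.gather(*[evaluate_batch(b) for b in batches],
                                       create_agent(), return_exceptions=True)
    agent, batch_results = all_results[-1], all_results[:-1]

    all_checks = []
    for i, result in enumerate(batch_results):
        if isinstance(result, Exception):
            logger.error(f"Guideline batch {i} failed: {result}")
            continue
        if isinstance(result, dict) and "checks" in result:
            all_checks.extend(c for c in result["checks"] if c.get("applies"))
    guideline_analysis = "\n".join(c["rationale"] for c in all_checks)
    return agent, guideline_analysis
```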
@ -430,8 +436,8 @@ async def chat_completions(request: ChatRequest, authorization: Optional[str] =
|
||||
except Exception as e:
|
||||
import traceback
|
||||
error_details = traceback.format_exc()
|
||||
print(f"Error in chat_completions: {str(e)}")
|
||||
print(f"Full traceback: {error_details}")
|
||||
logger.error(f"Error in chat_completions: {str(e)}")
|
||||
logger.error(f"Full traceback: {error_details}")
|
||||
raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
|
||||
|
||||
|
||||
@ -526,6 +532,6 @@ async def chat_completions_v2(request: ChatRequestV2, authorization: Optional[st
|
||||
except Exception as e:
|
||||
import traceback
|
||||
error_details = traceback.format_exc()
|
||||
print(f"Error in chat_completions_v2: {str(e)}")
|
||||
print(f"Full traceback: {error_details}")
|
||||
logger.error(f"Error in chat_completions_v2: {str(e)}")
|
||||
logger.error(f"Full traceback: {error_details}")
|
||||
raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
|
||||
|
||||
@ -13,6 +13,9 @@ from fastapi.responses import FileResponse, StreamingResponse
|
||||
import mimetypes
|
||||
import json
|
||||
import zipfile
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger('app')
|
||||
import tempfile
|
||||
import io
|
||||
import math
|
||||
@ -485,7 +488,7 @@ async def download_folder_as_zip(request: Dict[str, str]):
|
||||
zipf.write(file_path, arcname)
|
||||
except (OSError, IOError) as e:
|
||||
# 跳过无法读取的文件,但记录警告
|
||||
print(f"Warning: Skipping file {file_path}: {e}")
|
||||
logger.warning(f"Warning: Skipping file {file_path}: {e}")
|
||||
continue
|
||||
|
||||
zip_buffer.seek(0)
|
||||
@ -586,7 +589,7 @@ async def download_multiple_items_as_zip(request: Dict[str, Any]):
|
||||
try:
|
||||
zipf.write(target_path, target_path.name)
|
||||
except (OSError, IOError) as e:
|
||||
print(f"Warning: Skipping file {target_path}: {e}")
|
||||
logger.warning(f"Warning: Skipping file {target_path}: {e}")
|
||||
continue
|
||||
elif target_path.is_dir():
|
||||
# 文件夹
|
||||
@ -597,7 +600,7 @@ async def download_multiple_items_as_zip(request: Dict[str, Any]):
|
||||
arcname = f"{target_path.name}/{file_path.relative_to(target_path)}"
|
||||
zipf.write(file_path, arcname)
|
||||
except (OSError, IOError) as e:
|
||||
print(f"Warning: Skipping file {file_path}: {e}")
|
||||
logger.warning(f"Warning: Skipping file {file_path}: {e}")
|
||||
continue
|
||||
|
||||
zip_buffer.seek(0)
|
||||
|
||||
@ -5,6 +5,9 @@ from datetime import datetime
|
||||
from typing import Optional, List
|
||||
from fastapi import APIRouter, HTTPException, Header, UploadFile, File, Form
|
||||
from pydantic import BaseModel
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger('app')
|
||||
|
||||
from utils import (
|
||||
DatasetRequest, QueueTaskRequest, IncrementalTaskRequest, QueueTaskResponse,
|
||||
@ -83,7 +86,7 @@ async def process_files_async_endpoint(request: QueueTaskRequest, authorization:
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
print(f"Error submitting async file processing task: {str(e)}")
|
||||
logger.error(f"Error submitting async file processing task: {str(e)}")
|
||||
raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
|
||||
|
||||
|
||||
@ -146,7 +149,7 @@ async def process_files_incremental_endpoint(request: IncrementalTaskRequest, au
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
print(f"Error submitting incremental file processing task: {str(e)}")
|
||||
logger.error(f"Error submitting incremental file processing task: {str(e)}")
|
||||
raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
|
||||
|
||||
|
||||
@ -260,7 +263,7 @@ async def cleanup_project_async_endpoint(dataset_id: str, remove_all: bool = Fal
|
||||
"action": "remove_all" if remove_all else "cleanup_logs"
|
||||
}
|
||||
except Exception as e:
|
||||
print(f"Error submitting cleanup task: {str(e)}")
|
||||
logger.error(f"Error submitting cleanup task: {str(e)}")
|
||||
raise HTTPException(status_code=500, detail=f"提交清理任务失败: {str(e)}")
|
||||
|
||||
|
||||
@ -281,8 +284,8 @@ async def upload_file(file: UploadFile = File(...), folder: Optional[str] = Form
|
||||
"""
|
||||
try:
|
||||
# 调试信息
|
||||
print(f"Received folder parameter: {folder}")
|
||||
print(f"File received: {file.filename if file else 'None'}")
|
||||
logger.info(f"Received folder parameter: {folder}")
|
||||
logger.info(f"File received: {file.filename if file else 'None'}")
|
||||
|
||||
# 确定上传文件夹
|
||||
if folder:
|
||||
@ -345,7 +348,7 @@ async def upload_file(file: UploadFile = File(...), folder: Optional[str] = Form
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
print(f"Error uploading file: {str(e)}")
|
||||
logger.error(f"Error uploading file: {str(e)}")
|
||||
raise HTTPException(status_code=500, detail=f"文件上传失败: {str(e)}")
|
||||
|
||||
|
||||
@ -377,7 +380,7 @@ async def get_task_status(task_id: str):
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error getting task status: {str(e)}")
|
||||
logger.error(f"Error getting task status: {str(e)}")
|
||||
raise HTTPException(status_code=500, detail=f"获取任务状态失败: {str(e)}")
|
||||
|
||||
|
||||
@ -399,7 +402,7 @@ async def delete_task(task_id: str):
|
||||
"task_id": task_id
|
||||
}
|
||||
except Exception as e:
|
||||
print(f"Error deleting task: {str(e)}")
|
||||
logger.error(f"Error deleting task: {str(e)}")
|
||||
raise HTTPException(status_code=500, detail=f"删除任务记录失败: {str(e)}")
|
||||
|
||||
|
||||
@ -428,7 +431,7 @@ async def list_tasks(status: Optional[str] = None, dataset_id: Optional[str] = N
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error listing tasks: {str(e)}")
|
||||
logger.error(f"Error listing tasks: {str(e)}")
|
||||
raise HTTPException(status_code=500, detail=f"获取任务列表失败: {str(e)}")
|
||||
|
||||
|
||||
@ -445,7 +448,7 @@ async def get_task_statistics():
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error getting statistics: {str(e)}")
|
||||
logger.error(f"Error getting statistics: {str(e)}")
|
||||
raise HTTPException(status_code=500, detail=f"获取统计信息失败: {str(e)}")
|
||||
|
||||
|
||||
@ -463,5 +466,5 @@ async def cleanup_tasks(older_than_days: int = 7):
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error cleaning up tasks: {str(e)}")
|
||||
logger.error(f"Error cleaning up tasks: {str(e)}")
|
||||
raise HTTPException(status_code=500, detail=f"清理任务记录失败: {str(e)}")
|
||||
@ -2,6 +2,9 @@ import os
|
||||
import json
|
||||
from typing import Optional
|
||||
from fastapi import APIRouter, HTTPException
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger('app')
|
||||
|
||||
from task_queue.task_status import task_status_store
|
||||
|
||||
@ -45,7 +48,7 @@ async def list_all_projects():
|
||||
"updated_at": os.path.getmtime(item_path)
|
||||
})
|
||||
except Exception as e:
|
||||
print(f"Error reading robot project {item}: {str(e)}")
|
||||
logger.error(f"Error reading robot project {item}: {str(e)}")
|
||||
robot_projects.append({
|
||||
"id": item,
|
||||
"name": item,
|
||||
@ -95,7 +98,7 @@ async def list_all_projects():
|
||||
"updated_at": os.path.getmtime(item_path)
|
||||
})
|
||||
except Exception as e:
|
||||
print(f"Error reading dataset {item}: {str(e)}")
|
||||
logger.error(f"Error reading dataset {item}: {str(e)}")
|
||||
datasets.append({
|
||||
"id": item,
|
||||
"name": f"数据集 - {item[:8]}...",
|
||||
@ -118,7 +121,7 @@ async def list_all_projects():
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error listing projects: {str(e)}")
|
||||
logger.error(f"Error listing projects: {str(e)}")
|
||||
raise HTTPException(status_code=500, detail=f"获取项目列表失败: {str(e)}")
|
||||
|
||||
|
||||
@ -134,7 +137,7 @@ async def list_robot_projects():
|
||||
"projects": response["robot_projects"]
|
||||
}
|
||||
except Exception as e:
|
||||
print(f"Error listing robot projects: {str(e)}")
|
||||
logger.error(f"Error listing robot projects: {str(e)}")
|
||||
raise HTTPException(status_code=500, detail=f"获取机器人项目列表失败: {str(e)}")
|
||||
|
||||
|
||||
@ -150,7 +153,7 @@ async def list_datasets():
|
||||
"projects": response["datasets"]
|
||||
}
|
||||
except Exception as e:
|
||||
print(f"Error listing datasets: {str(e)}")
|
||||
logger.error(f"Error listing datasets: {str(e)}")
|
||||
raise HTTPException(status_code=500, detail=f"获取数据集列表失败: {str(e)}")
|
||||
|
||||
|
||||
@ -169,5 +172,5 @@ async def get_project_tasks(dataset_id: str):
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error getting project tasks: {str(e)}")
|
||||
logger.error(f"Error getting project tasks: {str(e)}")
|
||||
raise HTTPException(status_code=500, detail=f"获取项目任务失败: {str(e)}")
|
||||
@ -19,6 +19,9 @@ except ImportError:
|
||||
from utils.fastapi_utils import get_content_from_messages
|
||||
from embedding import get_model_manager
|
||||
from pydantic import BaseModel
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger('app')
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
@ -38,7 +41,7 @@ class EncodeResponse(BaseModel):
|
||||
|
||||
|
||||
# 系统优化设置初始化
|
||||
print("正在初始化系统优化...")
|
||||
logger.info("正在初始化系统优化...")
|
||||
system_optimizer = setup_system_optimizations()
|
||||
|
||||
# 全局助手管理器配置(使用优化后的配置)
|
||||
@ -66,10 +69,10 @@ file_cache = init_global_file_cache(
|
||||
ttl=int(os.getenv("FILE_CACHE_TTL", "300"))
|
||||
)
|
||||
|
||||
print("系统优化初始化完成")
|
||||
print(f"- 分片Agent管理器: {shard_count} 个分片,最多缓存 {max_cached_agents} 个agent")
|
||||
print(f"- 连接池: 每主机100连接,总计500连接")
|
||||
print(f"- 文件缓存: 1000个文件,TTL 300秒")
|
||||
logger.info("系统优化初始化完成")
|
||||
logger.info(f"- 分片Agent管理器: {shard_count} 个分片,最多缓存 {max_cached_agents} 个agent")
|
||||
logger.info(f"- 连接池: 每主机100连接,总计500连接")
|
||||
logger.info(f"- 文件缓存: 1000个文件,TTL 300秒")
|
||||
|
||||
|
||||
@router.get("/api/health")
|
||||
@ -128,7 +131,7 @@ async def get_performance_stats():
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error getting performance stats: {str(e)}")
|
||||
logger.error(f"Error getting performance stats: {str(e)}")
|
||||
raise HTTPException(status_code=500, detail=f"获取性能统计失败: {str(e)}")
|
||||
|
||||
|
||||
@ -146,7 +149,7 @@ async def optimize_system(profile: str = "balanced"):
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error applying optimization profile: {str(e)}")
|
||||
logger.error(f"Error applying optimization profile: {str(e)}")
|
||||
raise HTTPException(status_code=500, detail=f"应用优化配置失败: {str(e)}")
|
||||
|
||||
|
||||
@ -175,7 +178,7 @@ async def clear_system_cache(cache_type: Optional[str] = None):
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error clearing cache: {str(e)}")
|
||||
logger.error(f"Error clearing cache: {str(e)}")
|
||||
raise HTTPException(status_code=500, detail=f"清理缓存失败: {str(e)}")
|
||||
|
||||
|
||||
@ -197,7 +200,7 @@ async def get_system_config():
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error getting system config: {str(e)}")
|
||||
logger.error(f"Error getting system config: {str(e)}")
|
||||
raise HTTPException(status_code=500, detail=f"获取系统配置失败: {str(e)}")
|
||||
|
||||
|
||||
@ -260,7 +263,7 @@ async def encode_texts(request: EncodeRequest):
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
from utils.logger import logger
|
||||
|
||||
logger.error(f"文本编码 API 错误: {e}")
|
||||
return EncodeResponse(
|
||||
success=False,
|
||||
@ -269,4 +272,4 @@ async def encode_texts(request: EncodeRequest):
|
||||
processing_time=0.0,
|
||||
total_texts=len(request.texts) if request else 0,
|
||||
error=f"编码失败: {str(e)}"
|
||||
)
|
||||
)
|
||||
|
||||
@ -4,9 +4,13 @@ Queue configuration using SqliteHuey for asynchronous file processing.
|
||||
"""
|
||||
|
||||
import os
|
||||
import logging
|
||||
from huey import SqliteHuey
|
||||
from datetime import timedelta
|
||||
|
||||
# 配置日志
|
||||
logger = logging.getLogger('app')
|
||||
|
||||
# 确保projects/queue_data目录存在
|
||||
queue_data_dir = os.path.join(os.path.dirname(__file__), '..', 'projects', 'queue_data')
|
||||
os.makedirs(queue_data_dir, exist_ok=True)
|
||||
@ -24,4 +28,4 @@ huey.store_errors = True # 存储错误信息
|
||||
huey.max_retries = 3 # 最大重试次数
|
||||
huey.retry_delay = timedelta(seconds=60) # 重试延迟
|
||||
|
||||
print(f"SqliteHuey队列已初始化,数据库路径: {os.path.join(queue_data_dir, 'huey.db')}")
|
||||
logger.info(f"SqliteHuey队列已初始化,数据库路径: {os.path.join(queue_data_dir, 'huey.db')}")
|
||||
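The config.py hunk above initializes a SqliteHuey queue under `projects/queue_data` and now logs the database path through the shared logger instead of printing it. A minimal sketch of that kind of setup; the queue name is an assumption, while the path and the retry attributes mirror what the diff shows:

```python
import logging
import os
from datetime import timedelta

from huey import SqliteHuey

logger = logging.getLogger('app')

# Keep the SQLite-backed queue next to the project data (path mirrors the diff above).
queue_data_dir = os.path.join(os.path.dirname(__file__), '..', 'projects', 'queue_data')
os.makedirs(queue_data_dir, exist_ok=True)

huey = SqliteHuey(name='file_processing',  # queue name is illustrative
                  filename=os.path.join(queue_data_dir, 'huey.db'))
huey.store_errors = True                  # 存储错误信息
huey.max_retries = 3                      # 最大重试次数
huey.retry_delay = timedelta(seconds=60)  # 重试延迟

logger.info(f"SqliteHuey队列已初始化,数据库路径: {os.path.join(queue_data_dir, 'huey.db')}")
```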
@ -6,11 +6,15 @@ Queue manager for handling file processing queues.
|
||||
import os
|
||||
import json
|
||||
import time
|
||||
import logging
|
||||
from typing import Dict, List, Optional, Any
|
||||
from huey import Huey
|
||||
from huey.api import Task
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
# 配置日志
|
||||
logger = logging.getLogger('app')
|
||||
|
||||
from .config import huey
|
||||
from .tasks import process_file_async, process_multiple_files_async, process_zip_file_async, cleanup_processed_files
|
||||
|
||||
@ -20,7 +24,7 @@ class QueueManager:
|
||||
|
||||
def __init__(self):
|
||||
self.huey = huey
|
||||
print(f"队列管理器已初始化,使用数据库: {os.path.join(os.path.dirname(__file__), '..', 'projects', 'queue_data', 'huey.db')}")
|
||||
logger.info(f"队列管理器已初始化,使用数据库: {os.path.join(os.path.dirname(__file__), '..', 'projects', 'queue_data', 'huey.db')}")
|
||||
|
||||
def enqueue_file(
|
||||
self,
|
||||
@ -49,7 +53,7 @@ class QueueManager:
|
||||
else:
|
||||
task = process_file_async(project_id, file_path, original_filename)
|
||||
|
||||
print(f"文件已加入队列: {file_path}, 任务ID: {task.id}")
|
||||
logger.info(f"文件已加入队列: {file_path}, 任务ID: {task.id}")
|
||||
return task.id
|
||||
|
||||
def enqueue_multiple_files(
|
||||
@ -79,7 +83,7 @@ class QueueManager:
|
||||
else:
|
||||
task = process_multiple_files_async(project_id, file_paths, original_filenames)
|
||||
|
||||
print(f"批量文件已加入队列: {len(file_paths)} 个文件, 任务ID: {task.id}")
|
||||
logger.info(f"批量文件已加入队列: {len(file_paths)} 个文件, 任务ID: {task.id}")
|
||||
return [task.id]
|
||||
|
||||
def enqueue_zip_file(
|
||||
@ -109,7 +113,7 @@ class QueueManager:
|
||||
else:
|
||||
task = process_zip_file_async(project_id, zip_path, extract_to)
|
||||
|
||||
print(f"zip文件已加入队列: {zip_path}, 任务ID: {task.id}")
|
||||
logger.info(f"zip文件已加入队列: {zip_path}, 任务ID: {task.id}")
|
||||
return task.id
|
||||
|
||||
def get_task_status(self, task_id: str) -> Dict[str, Any]:
|
||||
@ -201,7 +205,7 @@ class QueueManager:
|
||||
stats["pending_tasks"] = len(pending_tasks)
|
||||
stats["total_tasks"] += len(pending_tasks)
|
||||
except Exception as e:
|
||||
print(f"获取pending任务失败: {e}")
|
||||
logger.error(f"获取pending任务失败: {e}")
|
||||
|
||||
# 尝试获取定时任务数量
|
||||
try:
|
||||
@ -209,7 +213,7 @@ class QueueManager:
|
||||
stats["scheduled_tasks"] = len(scheduled_tasks)
|
||||
stats["total_tasks"] += len(scheduled_tasks)
|
||||
except Exception as e:
|
||||
print(f"获取scheduled任务失败: {e}")
|
||||
logger.error(f"获取scheduled任务失败: {e}")
|
||||
|
||||
return stats
|
||||
|
||||
@ -237,7 +241,7 @@ class QueueManager:
|
||||
else:
|
||||
return False
|
||||
except Exception as e:
|
||||
print(f"取消任务失败: {str(e)}")
|
||||
logger.error(f"取消任务失败: {str(e)}")
|
||||
return False
|
||||
|
||||
def cleanup_old_tasks(self, older_than_days: int = 7) -> Dict[str, Any]:
|
||||
@ -292,7 +296,7 @@ class QueueManager:
|
||||
else:
|
||||
task = cleanup_processed_files(project_id, older_than_days)
|
||||
|
||||
print(f"清理任务已加入队列: 项目 {project_id}, 任务ID: {task.id}")
|
||||
logger.info(f"清理任务已加入队列: 项目 {project_id}, 任务ID: {task.id}")
|
||||
return task.id
|
||||
|
||||
def list_pending_tasks(self, limit: int = 50) -> List[Dict[str, Any]]:
|
||||
@ -318,7 +322,7 @@ class QueueManager:
|
||||
"status": "pending",
|
||||
})
|
||||
except Exception as e:
|
||||
print(f"获取pending任务失败: {e}")
|
||||
logger.error(f"获取pending任务失败: {e}")
|
||||
|
||||
# 获取scheduled任务
|
||||
try:
|
||||
@ -330,12 +334,12 @@ class QueueManager:
|
||||
"status": "scheduled",
|
||||
})
|
||||
except Exception as e:
|
||||
print(f"获取scheduled任务失败: {e}")
|
||||
logger.error(f"获取scheduled任务失败: {e}")
|
||||
|
||||
return pending_tasks
|
||||
|
||||
except Exception as e:
|
||||
print(f"获取待处理任务失败: {str(e)}")
|
||||
logger.error(f"获取待处理任务失败: {str(e)}")
|
||||
return []
|
||||
|
||||
def get_task_result(self, task_id: str) -> Optional[Any]:
|
||||
@ -354,7 +358,7 @@ class QueueManager:
|
||||
return task.get()
|
||||
return None
|
||||
except Exception as e:
|
||||
print(f"获取任务结果失败: {str(e)}")
|
||||
logger.error(f"获取任务结果失败: {str(e)}")
|
||||
return None
|
||||
|
||||
|
||||
|
||||
@ -9,10 +9,14 @@ import time
|
||||
import signal
|
||||
import argparse
|
||||
import multiprocessing
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
import threading
|
||||
|
||||
# 配置日志
|
||||
logger = logging.getLogger('app')
|
||||
|
||||
# 添加项目根目录到Python路径
|
||||
project_root = Path(__file__).parent.parent
|
||||
sys.path.insert(0, str(project_root))
|
||||
@ -50,7 +54,7 @@ class OptimizedQueueConsumer:
|
||||
|
||||
def _signal_handler(self, signum, frame):
|
||||
"""信号处理器,用于优雅关闭"""
|
||||
print(f"\n收到信号 {signum},正在关闭队列消费者...")
|
||||
logger.info(f"\n收到信号 {signum},正在关闭队列消费者...")
|
||||
self.running = False
|
||||
if self.consumer:
|
||||
self.consumer.stop()
|
||||
@ -80,9 +84,9 @@ class OptimizedQueueConsumer:
|
||||
if hasattr(huey, 'always_eager'):
|
||||
huey.always_eager = False
|
||||
|
||||
print(f"队列消费者优化设置完成:")
|
||||
print(f"- 工作类型: {self.worker_type}")
|
||||
print(f"- 工作线程数: {self.workers}")
|
||||
logger.info(f"队列消费者优化设置完成:")
|
||||
logger.info(f"- 工作类型: {self.worker_type}")
|
||||
logger.info(f"- 工作线程数: {self.workers}")
|
||||
|
||||
def monitor_performance(self):
|
||||
"""性能监控线程"""
|
||||
@ -93,27 +97,26 @@ class OptimizedQueueConsumer:
|
||||
elapsed = time.time() - self.start_time
|
||||
rate = self.performance_stats['tasks_processed'] / max(1, elapsed)
|
||||
|
||||
print(f"\n[性能统计]")
|
||||
print(f"- 运行时间: {elapsed:.1f}秒")
|
||||
print(f"- 已处理任务: {self.performance_stats['tasks_processed']}")
|
||||
print(f"- 失败任务: {self.performance_stats['tasks_failed']}")
|
||||
print(f"- 平均处理速率: {rate:.2f} 任务/秒")
|
||||
logger.info(f"\n[性能统计]")
|
||||
logger.info(f"- 运行时间: {elapsed:.1f}秒")
|
||||
logger.info(f"- 已处理任务: {self.performance_stats['tasks_processed']}")
|
||||
logger.info(f"- 失败任务: {self.performance_stats['tasks_failed']}")
|
||||
logger.info(f"- 平均处理速率: {rate:.2f} 任务/秒")
|
||||
|
||||
if self.performance_stats['avg_processing_time'] > 0:
|
||||
print(f"- 平均处理时间: {self.performance_stats['avg_processing_time']:.2f}秒")
|
||||
logger.info(f"- 平均处理时间: {self.performance_stats['avg_processing_time']:.2f}秒")
|
||||
|
||||
def start(self):
|
||||
"""启动队列消费者"""
|
||||
print("=" * 60)
|
||||
print("优化的队列消费者启动")
|
||||
print("=" * 60)
|
||||
logger.info("=" * 60)
|
||||
logger.info("优化的队列消费者启动")
|
||||
logger.info("=" * 60)
|
||||
|
||||
# 设置优化
|
||||
self.setup_optimizations()
|
||||
|
||||
print(f"数据库: {os.path.join(os.path.dirname(__file__), '..', 'projects', 'queue_data', 'huey.db')}")
|
||||
print("按 Ctrl+C 停止消费者")
|
||||
print()
|
||||
logger.info(f"数据库: {os.path.join(os.path.dirname(__file__), '..', 'projects', 'queue_data', 'huey.db')}")
|
||||
logger.info("按 Ctrl+C 停止消费者")
|
||||
|
||||
self.running = True
|
||||
self.start_time = time.time()
|
||||
@ -134,15 +137,15 @@ class OptimizedQueueConsumer:
|
||||
periodic=True, # 启用定期任务
|
||||
)
|
||||
|
||||
print("队列消费者已启动,等待任务...")
|
||||
logger.info("队列消费者已启动,等待任务...")
|
||||
|
||||
# 启动消费者
|
||||
self.consumer.run()
|
||||
|
||||
except KeyboardInterrupt:
|
||||
print("\n收到键盘中断信号")
|
||||
logger.info("\n收到键盘中断信号")
|
||||
except Exception as e:
|
||||
print(f"队列消费者运行错误: {e}")
|
||||
logger.error(f"队列消费者运行错误: {e}")
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
finally:
|
||||
@ -150,27 +153,27 @@ class OptimizedQueueConsumer:
|
||||
|
||||
def shutdown(self):
|
||||
"""关闭队列消费者"""
|
||||
print("\n正在关闭队列消费者...")
|
||||
logger.info("\n正在关闭队列消费者...")
|
||||
self.running = False
|
||||
|
||||
if self.consumer:
|
||||
try:
|
||||
self.consumer.stop()
|
||||
print("队列消费者已停止")
|
||||
logger.info("队列消费者已停止")
|
||||
except Exception as e:
|
||||
print(f"停止队列消费者时出错: {e}")
|
||||
logger.error(f"停止队列消费者时出错: {e}")
|
||||
|
||||
# 输出最终统计
|
||||
if self.start_time:
|
||||
elapsed = time.time() - self.start_time
|
||||
print(f"\n[最终统计]")
|
||||
print(f"- 总运行时间: {elapsed:.1f}秒")
|
||||
print(f"- 总处理任务: {self.performance_stats['tasks_processed']}")
|
||||
print(f"- 总失败任务: {self.performance_stats['tasks_failed']}")
|
||||
logger.info(f"\n[最终统计]")
|
||||
logger.info(f"- 总运行时间: {elapsed:.1f}秒")
|
||||
logger.info(f"- 总处理任务: {self.performance_stats['tasks_processed']}")
|
||||
logger.info(f"- 总失败任务: {self.performance_stats['tasks_failed']}")
|
||||
|
||||
if self.performance_stats['tasks_processed'] > 0:
|
||||
rate = self.performance_stats['tasks_processed'] / elapsed
|
||||
print(f"- 平均处理速率: {rate:.2f} 任务/秒")
|
||||
logger.info(f"- 平均处理速率: {rate:.2f} 任务/秒")
|
||||
|
||||
|
||||
def calculate_optimal_workers():
|
||||
@ -191,25 +194,25 @@ def check_queue_status():
|
||||
try:
|
||||
stats = queue_manager.get_queue_stats()
|
||||
|
||||
print("\n[队列状态]")
|
||||
logger.info("\n[队列状态]")
|
||||
if isinstance(stats, dict):
|
||||
if 'total_tasks' in stats:
|
||||
print(f"- 总任务数: {stats['total_tasks']}")
|
||||
logger.info(f"- 总任务数: {stats['total_tasks']}")
|
||||
if 'pending_tasks' in stats:
|
||||
print(f"- 待处理任务: {stats['pending_tasks']}")
|
||||
logger.info(f"- 待处理任务: {stats['pending_tasks']}")
|
||||
if 'scheduled_tasks' in stats:
|
||||
print(f"- 定时任务: {stats['scheduled_tasks']}")
|
||||
logger.info(f"- 定时任务: {stats['scheduled_tasks']}")
|
||||
|
||||
# 检查数据库文件
|
||||
db_path = os.path.join(os.path.dirname(__file__), '..', 'projects', 'queue_data', 'huey.db')
|
||||
if os.path.exists(db_path):
|
||||
size = os.path.getsize(db_path)
|
||||
print(f"- 数据库大小: {size} 字节")
|
||||
logger.info(f"- 数据库大小: {size} 字节")
|
||||
else:
|
||||
print("- 数据库文件: 不存在")
|
||||
logger.info("- 数据库文件: 不存在")
|
||||
|
||||
except Exception as e:
|
||||
print(f"获取队列状态失败: {e}")
|
||||
logger.error(f"获取队列状态失败: {e}")
|
||||
|
||||
|
||||
def main():
|
||||
@ -248,11 +251,11 @@ def main():
|
||||
os.environ['PYTHONOPTIMIZE'] = '1'
|
||||
if args.workers > 2:
|
||||
args.workers = 2
|
||||
print(f"低内存模式: 调整工作线程数为 {args.workers}")
|
||||
logger.info(f"低内存模式: 调整工作线程数为 {args.workers}")
|
||||
elif args.profile == "high_performance":
|
||||
if args.workers < 4:
|
||||
args.workers = 4
|
||||
print(f"高性能模式: 调整工作线程数为 {args.workers}")
|
||||
logger.info(f"高性能模式: 调整工作线程数为 {args.workers}")
|
||||
|
||||
# 检查队列状态
|
||||
if args.check_status:
|
||||
@ -263,12 +266,12 @@ def main():
|
||||
try:
|
||||
import psutil
|
||||
memory = psutil.virtual_memory()
|
||||
print(f"[系统信息]")
|
||||
print(f"- CPU核心数: {multiprocessing.cpu_count()}")
|
||||
print(f"- 可用内存: {memory.available / (1024**3):.1f}GB")
|
||||
print(f"- 内存使用率: {memory.percent:.1f}%")
|
||||
logger.info(f"[系统信息]")
|
||||
logger.info(f"- CPU核心数: {multiprocessing.cpu_count()}")
|
||||
logger.info(f"- 可用内存: {memory.available / (1024**3):.1f}GB")
|
||||
logger.info(f"- 内存使用率: {memory.percent:.1f}%")
|
||||
except ImportError:
|
||||
print("[提示] 安装 psutil 可显示系统信息: pip install psutil")
|
||||
logger.info("[提示] 安装 psutil 可显示系统信息: pip install psutil")
|
||||
|
||||
# 创建并启动队列消费者
|
||||
consumer = OptimizedQueueConsumer(
|
||||
|
||||
@ -7,10 +7,14 @@ import os
|
||||
import json
|
||||
import time
|
||||
import shutil
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional, Any
|
||||
from huey import crontab
|
||||
|
||||
# 配置日志
|
||||
logger = logging.getLogger('app')
|
||||
|
||||
from .config import huey
|
||||
from utils.file_utils import (
|
||||
extract_zip_file,
|
||||
@ -42,7 +46,7 @@ def process_file_async(
|
||||
处理结果字典
|
||||
"""
|
||||
try:
|
||||
print(f"开始处理文件: {file_path}")
|
||||
logger.info(f"开始处理文件: {file_path}")
|
||||
|
||||
# 确保项目目录存在
|
||||
project_dir = os.path.join("projects", project_id)
|
||||
@ -55,7 +59,7 @@ def process_file_async(
|
||||
# 检查文件是否已处理
|
||||
processed_log = load_processed_files_log(project_id)
|
||||
if file_hash in processed_log:
|
||||
print(f"文件已处理,跳过: {file_path}")
|
||||
logger.info(f"文件已处理,跳过: {file_path}")
|
||||
return {
|
||||
"status": "skipped",
|
||||
"message": "文件已处理",
|
||||
@ -84,12 +88,12 @@ def process_file_async(
|
||||
result["file_hash"] = file_hash
|
||||
result["project_id"] = project_id
|
||||
|
||||
print(f"文件处理完成: {file_path}, 状态: {result['status']}")
|
||||
logger.info(f"文件处理完成: {file_path}, 状态: {result['status']}")
|
||||
return result
|
||||
|
||||
except Exception as e:
|
||||
error_msg = f"处理文件时发生错误: {str(e)}"
|
||||
print(error_msg)
|
||||
logger.error(error_msg)
|
||||
return {
|
||||
"status": "error",
|
||||
"message": error_msg,
|
||||
@ -116,7 +120,7 @@ def process_multiple_files_async(
|
||||
处理结果列表
|
||||
"""
|
||||
try:
|
||||
print(f"开始批量处理 {len(file_paths)} 个文件")
|
||||
logger.info(f"开始批量处理 {len(file_paths)} 个文件")
|
||||
|
||||
results = []
|
||||
for i, file_path in enumerate(file_paths):
|
||||
@ -126,12 +130,12 @@ def process_multiple_files_async(
|
||||
result = process_file_async(project_id, file_path, original_filename)
|
||||
results.append(result)
|
||||
|
||||
print(f"批量文件处理任务已提交,共 {len(results)} 个文件")
|
||||
logger.info(f"批量文件处理任务已提交,共 {len(results)} 个文件")
|
||||
return results
|
||||
|
||||
except Exception as e:
|
||||
error_msg = f"批量处理文件时发生错误: {str(e)}"
|
||||
print(error_msg)
|
||||
logger.error(error_msg)
|
||||
return [{
|
||||
"status": "error",
|
||||
"message": error_msg,
|
||||
@ -157,7 +161,7 @@ def process_zip_file_async(
|
||||
处理结果字典
|
||||
"""
|
||||
try:
|
||||
print(f"开始处理zip文件: {zip_path}")
|
||||
logger.info(f"开始处理zip文件: {zip_path}")
|
||||
|
||||
# 设置解压目录
|
||||
if extract_to is None:
|
||||
@ -191,7 +195,7 @@ def process_zip_file_async(
|
||||
|
||||
except Exception as e:
|
||||
error_msg = f"处理zip文件时发生错误: {str(e)}"
|
||||
print(error_msg)
|
||||
logger.error(error_msg)
|
||||
return {
|
||||
"status": "error",
|
||||
"message": error_msg,
|
||||
@ -216,7 +220,7 @@ def cleanup_processed_files(
|
||||
清理结果字典
|
||||
"""
|
||||
try:
|
||||
print(f"开始清理项目 {project_id} 中 {older_than_days} 天前的文件")
|
||||
logger.info(f"开始清理项目 {project_id} 中 {older_than_days} 天前的文件")
|
||||
|
||||
project_dir = os.path.join("projects", project_id)
|
||||
if not os.path.exists(project_dir):
|
||||
@ -240,9 +244,9 @@ def cleanup_processed_files(
|
||||
try:
|
||||
os.remove(file_path)
|
||||
cleaned_files.append(file_path)
|
||||
print(f"已删除旧文件: {file_path}")
|
||||
logger.info(f"已删除旧文件: {file_path}")
|
||||
except Exception as e:
|
||||
print(f"删除文件失败 {file_path}: {str(e)}")
|
||||
logger.error(f"删除文件失败 {file_path}: {str(e)}")
|
||||
|
||||
# 清理空目录
|
||||
for root, dirs, files in os.walk(project_dir, topdown=False):
|
||||
@ -251,9 +255,9 @@ def cleanup_processed_files(
|
||||
try:
|
||||
if not os.listdir(dir_path):
|
||||
os.rmdir(dir_path)
|
||||
print(f"已删除空目录: {dir_path}")
|
||||
logger.info(f"已删除空目录: {dir_path}")
|
||||
except Exception as e:
|
||||
print(f"删除目录失败 {dir_path}: {str(e)}")
|
||||
logger.error(f"删除目录失败 {dir_path}: {str(e)}")
|
||||
|
||||
return {
|
||||
"status": "success",
|
||||
@ -265,7 +269,7 @@ def cleanup_processed_files(
|
||||
|
||||
except Exception as e:
|
||||
error_msg = f"清理文件时发生错误: {str(e)}"
|
||||
print(error_msg)
|
||||
logger.error(error_msg)
|
||||
return {
|
||||
"status": "error",
|
||||
"message": error_msg,
|
||||
@ -351,6 +355,6 @@ def _process_single_file(
|
||||
@huey.periodic_task(crontab(hour=2, minute=0))
|
||||
def daily_cleanup():
|
||||
"""每日清理任务"""
|
||||
print("执行每日清理任务")
|
||||
logger.info("执行每日清理任务")
|
||||
# 这里可以添加清理逻辑
|
||||
return {"status": "completed", "message": "每日清理任务完成"}
|
||||
|
||||
@ -2,7 +2,7 @@ import asyncio
|
||||
from typing import List, Optional
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
logger = logging.getLogger('app')
|
||||
|
||||
|
||||
class AgentPool:
|
||||
@ -175,4 +175,4 @@ async def release_agent_to_pool(agent):
|
||||
if _global_agent_pool is None:
|
||||
raise RuntimeError("全局助手实例池未初始化")
|
||||
|
||||
await _global_agent_pool.release_agent(agent)
|
||||
await _global_agent_pool.release_agent(agent)
|
||||
|
||||
@ -8,6 +8,7 @@ import json
|
||||
import asyncio
|
||||
import aiofiles
|
||||
import aiofiles.os
|
||||
import logging
|
||||
from typing import Dict, List, Optional, Any
|
||||
from pathlib import Path
|
||||
import weakref
|
||||
@ -15,6 +16,9 @@ import threading
|
||||
import time
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
|
||||
# Configure logger
|
||||
logger = logging.getLogger('app')
|
||||
|
||||
|
||||
class AsyncFileCache:
|
||||
"""异步文件缓存管理器"""
|
||||
@ -70,7 +74,7 @@ class AsyncFileCache:
|
||||
return content
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error reading file {abs_path}: {e}")
|
||||
logger.error(f"Error reading file {abs_path}: {e}")
|
||||
return ""
|
||||
|
||||
def _read_text_file(self, file_path: str, encoding: str) -> str:
|
||||
@ -90,7 +94,7 @@ class AsyncFileCache:
|
||||
try:
|
||||
return json.loads(content)
|
||||
except json.JSONDecodeError as e:
|
||||
print(f"Error parsing JSON from {file_path}: {e}")
|
||||
logger.error(f"Error parsing JSON from {file_path}: {e}")
|
||||
return {}
|
||||
|
||||
async def write_file(self, file_path: str, content: str, encoding: str = 'utf-8'):
|
||||
@ -244,7 +248,7 @@ class ParallelFileReader:
|
||||
content = await task
|
||||
results[file_path] = content
|
||||
except Exception as e:
|
||||
print(f"Error reading {file_path}: {e}")
|
||||
logger.error(f"Error reading {file_path}: {e}")
|
||||
results[file_path] = ""
|
||||
|
||||
return results
|
||||
@ -269,7 +273,7 @@ class ParallelFileReader:
|
||||
try:
|
||||
results[file_path] = json.loads(content)
|
||||
except json.JSONDecodeError as e:
|
||||
print(f"Error parsing JSON from {file_path}: {e}")
|
||||
logger.error(f"Error parsing JSON from {file_path}: {e}")
|
||||
results[file_path] = {}
|
||||
else:
|
||||
results[file_path] = {}
|
||||
|
||||
@ -5,15 +5,19 @@ Data merging functions for combining processed file results.
|
||||
|
||||
import os
|
||||
import pickle
|
||||
import logging
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
import json
|
||||
|
||||
# Configure logger
|
||||
logger = logging.getLogger('app')
|
||||
|
||||
# Try to import numpy, but handle if missing
|
||||
try:
|
||||
import numpy as np
|
||||
NUMPY_SUPPORT = True
|
||||
except ImportError:
|
||||
print("NumPy not available, some embedding features may be limited")
|
||||
logger.warning("NumPy not available, some embedding features may be limited")
|
||||
NUMPY_SUPPORT = False
|
||||
|
||||
|
||||
@ -66,7 +70,7 @@ def merge_documents_by_group(unique_id: str, group_name: str) -> Dict:
|
||||
result["source_files"].append(filename_stem)
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error reading document file {document_path}: {str(e)}")
|
||||
logger.error(f"Error reading document file {document_path}: {str(e)}")
|
||||
continue
|
||||
|
||||
if merged_content:
|
||||
@ -83,7 +87,7 @@ def merge_documents_by_group(unique_id: str, group_name: str) -> Dict:
|
||||
|
||||
except Exception as e:
|
||||
result["error"] = f"Document merging failed: {str(e)}"
|
||||
print(f"Error merging documents for group {group_name}: {str(e)}")
|
||||
logger.error(f"Error merging documents for group {group_name}: {str(e)}")
|
||||
|
||||
return result
|
||||
|
||||
@ -136,7 +140,7 @@ def merge_paginations_by_group(unique_id: str, group_name: str) -> Dict:
|
||||
result["source_files"].append(filename_stem)
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error reading pagination file {pagination_path}: {str(e)}")
|
||||
logger.error(f"Error reading pagination file {pagination_path}: {str(e)}")
|
||||
continue
|
||||
|
||||
if merged_lines:
|
||||
@ -153,7 +157,7 @@ def merge_paginations_by_group(unique_id: str, group_name: str) -> Dict:
|
||||
|
||||
except Exception as e:
|
||||
result["error"] = f"Pagination merging failed: {str(e)}"
|
||||
print(f"Error merging paginations for group {group_name}: {str(e)}")
|
||||
logger.error(f"Error merging paginations for group {group_name}: {str(e)}")
|
||||
|
||||
return result
|
||||
|
||||
@ -236,7 +240,7 @@ def merge_embeddings_by_group(unique_id: str, group_name: str) -> Dict:
|
||||
result["source_files"].append(filename_stem)
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error loading embedding file {embedding_path}: {str(e)}")
|
||||
logger.error(f"Error loading embedding file {embedding_path}: {str(e)}")
|
||||
continue
|
||||
|
||||
if all_chunks and all_embeddings:
|
||||
@ -259,7 +263,7 @@ def merge_embeddings_by_group(unique_id: str, group_name: str) -> Dict:
|
||||
np_embeddings.append(emb)
|
||||
else:
|
||||
# 如果无法转换,跳过这个文件
|
||||
print(f"Warning: Cannot convert embedding to numpy from file {filename_stem}")
|
||||
logger.warning(f"Warning: Cannot convert embedding to numpy from file {filename_stem}")
|
||||
continue
|
||||
|
||||
if np_embeddings:
|
||||
@ -283,7 +287,7 @@ def merge_embeddings_by_group(unique_id: str, group_name: str) -> Dict:
|
||||
elif isinstance(emb, np.ndarray):
|
||||
np_embeddings.append(emb)
|
||||
else:
|
||||
print(f"Warning: Cannot convert embedding to numpy from file {filename_stem}")
|
||||
logger.warning(f"Warning: Cannot convert embedding to numpy from file {filename_stem}")
|
||||
continue
|
||||
|
||||
if np_embeddings:
|
||||
@ -297,7 +301,7 @@ def merge_embeddings_by_group(unique_id: str, group_name: str) -> Dict:
|
||||
return result
|
||||
except Exception as e:
|
||||
result["error"] = f"Failed to merge embedding tensors: {str(e)}"
|
||||
print(f"Error merging embedding tensors: {str(e)}")
|
||||
logger.error(f"Error merging embedding tensors: {str(e)}")
|
||||
return result
|
||||
|
||||
# Create merged embedding data structure
|
||||
@ -327,7 +331,7 @@ def merge_embeddings_by_group(unique_id: str, group_name: str) -> Dict:
|
||||
|
||||
except Exception as e:
|
||||
result["error"] = f"Embedding merging failed: {str(e)}"
|
||||
print(f"Error merging embeddings for group {group_name}: {str(e)}")
|
||||
logger.error(f"Error merging embeddings for group {group_name}: {str(e)}")
|
||||
|
||||
return result
|
||||
|
||||
@ -425,11 +429,11 @@ def cleanup_dataset_group(unique_id: str, group_name: str) -> bool:
|
||||
if os.path.exists(dataset_group_dir):
|
||||
import shutil
|
||||
shutil.rmtree(dataset_group_dir)
|
||||
print(f"Cleaned up dataset group: {group_name}")
|
||||
logger.info(f"Cleaned up dataset group: {group_name}")
|
||||
return True
|
||||
else:
|
||||
return True # Nothing to clean up
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error cleaning up dataset group {group_name}: {str(e)}")
|
||||
logger.error(f"Error cleaning up dataset group {group_name}: {str(e)}")
|
||||
return False
|
||||
|
||||
@ -6,8 +6,12 @@ New implementation with per-file processing and group merging.
|
||||
|
||||
import os
|
||||
import json
|
||||
import logging
|
||||
from typing import Dict, List
|
||||
|
||||
# Configure logger
|
||||
logger = logging.getLogger('app')
|
||||
|
||||
# Import new modules
|
||||
from utils.file_manager import (
|
||||
ensure_directories, sync_files_to_group, cleanup_orphaned_files,
|
||||
@ -37,13 +41,13 @@ async def download_dataset_files(unique_id: str, files: Dict[str, List[str]], in
|
||||
if not files:
|
||||
return {}
|
||||
|
||||
print(f"Starting {'incremental' if incremental_mode else 'full'} file processing for project: {unique_id}")
|
||||
logger.info(f"Starting {'incremental' if incremental_mode else 'full'} file processing for project: {unique_id}")
|
||||
|
||||
# Ensure project directories exist
|
||||
ensure_directories(unique_id)
|
||||
|
||||
# Step 1: Sync files to group directories
|
||||
print("Step 1: Syncing files to group directories...")
|
||||
logger.info("Step 1: Syncing files to group directories...")
|
||||
synced_files, failed_files = sync_files_to_group(unique_id, files, incremental_mode)
|
||||
|
||||
# Step 2: Detect changes and cleanup orphaned files (only in non-incremental mode)
|
||||
@ -52,14 +56,14 @@ async def download_dataset_files(unique_id: str, files: Dict[str, List[str]], in
|
||||
|
||||
# Only cleanup orphaned files in non-incremental mode or when files are explicitly removed
|
||||
if not incremental_mode and any(changes["removed"].values()):
|
||||
print("Step 2: Cleaning up orphaned files...")
|
||||
logger.info("Step 2: Cleaning up orphaned files...")
|
||||
removed_files = cleanup_orphaned_files(unique_id, changes)
|
||||
print(f"Removed orphaned files: {removed_files}")
|
||||
logger.info(f"Removed orphaned files: {removed_files}")
|
||||
elif incremental_mode:
|
||||
print("Step 2: Skipping cleanup in incremental mode to preserve existing files")
|
||||
logger.info("Step 2: Skipping cleanup in incremental mode to preserve existing files")
|
||||
|
||||
# Step 3: Process individual files
|
||||
print("Step 3: Processing individual files...")
|
||||
logger.info("Step 3: Processing individual files...")
|
||||
processed_files_by_group = {}
|
||||
processing_results = {}
|
||||
|
||||
@ -75,12 +79,12 @@ async def download_dataset_files(unique_id: str, files: Dict[str, List[str]], in
|
||||
|
||||
# Skip if file doesn't exist (might be remote file that failed to download)
|
||||
if not os.path.exists(local_path) and not file_path.startswith(('http://', 'https://')):
|
||||
print(f"Skipping non-existent file: {filename}")
|
||||
logger.warning(f"Skipping non-existent file: {filename}")
|
||||
continue
|
||||
|
||||
# Check if already processed
|
||||
if check_file_already_processed(unique_id, group_name, filename):
|
||||
print(f"Skipping already processed file: {filename}")
|
||||
logger.info(f"Skipping already processed file: {filename}")
|
||||
processed_files_by_group[group_name].append(filename)
|
||||
processing_results[group_name].append({
|
||||
"filename": filename,
|
||||
@ -89,18 +93,18 @@ async def download_dataset_files(unique_id: str, files: Dict[str, List[str]], in
|
||||
continue
|
||||
|
||||
# Process the file
|
||||
print(f"Processing file: {filename} (group: {group_name})")
|
||||
logger.info(f"Processing file: {filename} (group: {group_name})")
|
||||
result = await process_single_file(unique_id, group_name, filename, file_path, local_path)
|
||||
processing_results[group_name].append(result)
|
||||
|
||||
if result["success"]:
|
||||
processed_files_by_group[group_name].append(filename)
|
||||
print(f" Successfully processed {filename}")
|
||||
logger.info(f" Successfully processed {filename}")
|
||||
else:
|
||||
print(f" Failed to process {filename}: {result['error']}")
|
||||
logger.error(f" Failed to process {filename}: {result['error']}")
|
||||
|
||||
# Step 4: Merge results by group
|
||||
print("Step 4: Merging results by group...")
|
||||
logger.info("Step 4: Merging results by group...")
|
||||
merge_results = {}
|
||||
|
||||
for group_name in processed_files_by_group.keys():
|
||||
@ -108,20 +112,20 @@ async def download_dataset_files(unique_id: str, files: Dict[str, List[str]], in
|
||||
group_files = get_group_files_list(unique_id, group_name)
|
||||
|
||||
if group_files:
|
||||
print(f"Merging group: {group_name} with {len(group_files)} files")
|
||||
logger.info(f"Merging group: {group_name} with {len(group_files)} files")
|
||||
merge_result = merge_all_data_by_group(unique_id, group_name)
|
||||
merge_results[group_name] = merge_result
|
||||
|
||||
if merge_result["success"]:
|
||||
print(f" Successfully merged group {group_name}")
|
||||
logger.info(f" Successfully merged group {group_name}")
|
||||
else:
|
||||
print(f" Failed to merge group {group_name}: {merge_result['errors']}")
|
||||
logger.error(f" Failed to merge group {group_name}: {merge_result['errors']}")
|
||||
|
||||
# Step 5: Save processing log
|
||||
print("Step 5: Saving processing log...")
|
||||
logger.info("Step 5: Saving processing log...")
|
||||
await save_processing_log(unique_id, files, synced_files, processing_results, merge_results)
|
||||
|
||||
print(f"File processing completed for project: {unique_id}")
|
||||
logger.info(f"File processing completed for project: {unique_id}")
|
||||
return processed_files_by_group
|
||||
|
||||
|
||||
@ -156,9 +160,9 @@ async def save_processing_log(
|
||||
try:
|
||||
with open(log_file_path, 'w', encoding='utf-8') as f:
|
||||
json.dump(log_data, f, ensure_ascii=False, indent=2)
|
||||
print(f"Processing log saved to: {log_file_path}")
|
||||
logger.info(f"Processing log saved to: {log_file_path}")
|
||||
except Exception as e:
|
||||
print(f"Error saving processing log: {str(e)}")
|
||||
logger.error(f"Error saving processing log: {str(e)}")
|
||||
|
||||
|
||||
def generate_dataset_structure(unique_id: str) -> str:
|
||||
|
||||
@ -5,8 +5,12 @@ Excel and CSV file processor for converting data to document.txt and pagination.
|
||||
|
||||
import os
|
||||
import pandas as pd
|
||||
import logging
|
||||
from typing import List, Dict, Any, Tuple
|
||||
|
||||
# 配置日志
|
||||
logger = logging.getLogger('app')
|
||||
|
||||
|
||||
def read_excel_sheets(file_path: str) -> Dict[str, List[Dict[str, Any]]]:
|
||||
"""
|
||||
@ -48,16 +52,16 @@ def read_excel_sheets(file_path: str) -> Dict[str, List[Dict[str, Any]]]:
|
||||
sheet_data.append(row_dict)
|
||||
|
||||
sheets_data[sheet_name] = sheet_data
|
||||
print(f"读取 Excel sheet '{sheet_name}': {len(sheet_data)} 行数据")
|
||||
logger.info(f"读取 Excel sheet '{sheet_name}': {len(sheet_data)} 行数据")
|
||||
|
||||
except Exception as e:
|
||||
print(f"读取 Excel sheet '{sheet_name}' 失败: {str(e)}")
|
||||
logger.error(f"读取 Excel sheet '{sheet_name}' 失败: {str(e)}")
|
||||
continue
|
||||
|
||||
return sheets_data
|
||||
|
||||
except Exception as e:
|
||||
print(f"读取 Excel 文件失败: {str(e)}")
|
||||
logger.error(f"读取 Excel 文件失败: {str(e)}")
|
||||
return {}
|
||||
|
||||
|
||||
@ -105,11 +109,11 @@ def read_csv_file(file_path: str, encoding: str = 'utf-8') -> List[Dict[str, Any
|
||||
if any(v.strip() for v in row_dict.values()):
|
||||
csv_data.append(row_dict)
|
||||
|
||||
print(f"读取 CSV 文件: {len(csv_data)} 行数据")
|
||||
logger.info(f"读取 CSV 文件: {len(csv_data)} 行数据")
|
||||
return csv_data
|
||||
|
||||
except Exception as e:
|
||||
print(f"读取 CSV 文件失败: {str(e)}")
|
||||
logger.error(f"读取 CSV 文件失败: {str(e)}")
|
||||
return []
|
||||
|
||||
|
||||
|
||||
@ -9,7 +9,9 @@ import aiohttp
|
||||
from qwen_agent.llm.schema import ASSISTANT, FUNCTION
|
||||
from qwen_agent.llm.oai import TextChatAtOAI
|
||||
from fastapi import HTTPException
|
||||
from utils.logger import logger
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger('app')
|
||||
|
||||
# 创建全局线程池执行器,用于执行同步的HTTP调用
|
||||
thread_pool = ThreadPoolExecutor(max_workers=10)
|
||||
@ -61,9 +63,9 @@ def get_versioned_filename(upload_dir: str, name_without_ext: str, file_extensio
|
||||
file_to_delete = os.path.join(upload_dir, filename)
|
||||
try:
|
||||
os.remove(file_to_delete)
|
||||
print(f"已删除文件: {file_to_delete}")
|
||||
logger.info(f"已删除文件: {file_to_delete}")
|
||||
except OSError as e:
|
||||
print(f"删除文件失败 {file_to_delete}: {e}")
|
||||
logger.error(f"删除文件失败 {file_to_delete}: {e}")
|
||||
|
||||
# 确定下一个版本号
|
||||
if existing_versions:
|
||||
@ -301,7 +303,7 @@ def create_project_directory(dataset_ids: Optional[List[str]], bot_id: str, robo
|
||||
from utils.multi_project_manager import create_robot_project
|
||||
return create_robot_project(dataset_ids, bot_id)
|
||||
except Exception as e:
|
||||
print(f"Error creating project directory: {e}")
|
||||
logger.error(f"Error creating project directory: {e}")
|
||||
return None
|
||||
|
||||
|
||||
@ -392,7 +394,7 @@ def _sync_call_guideline_llm(llm_config, messages) -> str:
|
||||
return str(response) if response else ""
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error calling guideline LLM: {e}")
|
||||
logger.error(f"Error calling guideline LLM: {e}")
|
||||
return ""
|
||||
|
||||
|
||||
@ -414,7 +416,7 @@ async def call_guideline_llm(chat_history: str, guidelines_text: str, terms:str,
|
||||
with open('./prompt/guideline_prompt.md', 'r', encoding='utf-8') as f:
|
||||
guideline_template = f.read()
|
||||
except Exception as e:
|
||||
print(f"Error reading guideline prompt template: {e}")
|
||||
logger.error(f"Error reading guideline prompt template: {e}")
|
||||
return ""
|
||||
|
||||
# 替换模板中的占位符
|
||||
@ -439,7 +441,7 @@ async def call_guideline_llm(chat_history: str, guidelines_text: str, terms:str,
|
||||
return response
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error calling guideline LLM: {e}")
|
||||
logger.error(f"Error calling guideline LLM: {e}")
|
||||
return ""
|
||||
|
||||
|
||||
@@ -466,28 +468,43 @@ async def process_guideline_batch(
model_server: str
) -> str:
"""处理单个guideline批次"""
try:
# 调用LLM分析这批guidelines
batch_guidelines_text = "\n".join(guidelines_batch)
batch_analysis = await call_guideline_llm(chat_history, batch_guidelines_text, terms, model_name, api_key, model_server)
max_retries = 3

# 从响应中提取 ```json 和 ``` 包裹的内容
json_pattern = r'```json\s*\n(.*?)\n```'
json_matches = re.findall(json_pattern, batch_analysis, re.DOTALL)

if json_matches:
try:
# 解析第一个找到的JSON对象
json_data = json.loads(json_matches[0])
return json_data # 返回解析后的JSON对象
except json.JSONDecodeError as e:
print(f"Error parsing JSON from guideline analysis: {e}")
return batch_analysis # 如果JSON解析失败,返回原始文本
else:
return batch_analysis # 如果没有找到JSON格式,返回原始文本
except Exception as e:
print(f"Error processing guideline batch: {e}")
return ""
for attempt in range(max_retries):
try:
# 调用LLM分析这批guidelines
batch_guidelines_text = "\n".join(guidelines_batch)
logger.info(f"Start processed guideline batch on attempt {attempt + 1}")
batch_analysis = await call_guideline_llm(chat_history, batch_guidelines_text, terms, model_name, api_key, model_server)

# 从响应中提取 ```json 和 ``` 包裹的内容
json_pattern = r'```json\s*\n(.*?)\n```'
json_matches = re.findall(json_pattern, batch_analysis, re.DOTALL)

if json_matches:
try:
# 解析第一个找到的JSON对象
json_data = json.loads(json_matches[0])
logger.info(f"Successfully processed guideline batch on attempt {attempt + 1}")
return json_data # 返回解析后的JSON对象
except json.JSONDecodeError as e:
logger.error(f"Error parsing JSON from guideline analysis on attempt {attempt + 1}: {e}")
if attempt == max_retries - 1:
return batch_analysis # 最后一次尝试失败,返回原始文本
continue
else:
logger.warning(f"No JSON format found in guideline analysis on attempt {attempt + 1}")
if attempt == max_retries - 1:
return batch_analysis # 最后一次尝试失败,返回原始文本
continue

except Exception as e:
logger.error(f"Error processing guideline batch on attempt {attempt + 1}: {e}")
if attempt == max_retries - 1:
return "" # 最后一次尝试失败,返回空字符串

# 这里不应该到达,但为了完整性
return ""


def extract_block_from_system_prompt(system_prompt: Optional[str]) -> tuple[str, List[Dict[str, Any]], List[Dict[str, Any]]]:
|
||||
@ -519,7 +536,7 @@ def extract_block_from_system_prompt(system_prompt: Optional[str]) -> tuple[str,
|
||||
guidelines_list.extend(guidelines)
|
||||
blocks_to_remove.append(match.group(0))
|
||||
except Exception as e:
|
||||
print(f"Error parsing guidelines: {e}")
|
||||
logger.error(f"Error parsing guidelines: {e}")
|
||||
|
||||
elif block_type == 'terms':
|
||||
try:
|
||||
@ -527,7 +544,7 @@ def extract_block_from_system_prompt(system_prompt: Optional[str]) -> tuple[str,
|
||||
terms_list.extend(terms)
|
||||
blocks_to_remove.append(match.group(0))
|
||||
except Exception as e:
|
||||
print(f"Error parsing terms: {e}")
|
||||
logger.error(f"Error parsing terms: {e}")
|
||||
|
||||
# 从system_prompt中移除这些已解析的块
|
||||
cleaned_prompt = system_prompt
|
||||
|
||||
@ -8,6 +8,9 @@ import json
|
||||
import shutil
|
||||
from typing import Dict, List, Set, Tuple
|
||||
from pathlib import Path
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger('app')
|
||||
|
||||
|
||||
def get_existing_files(unique_id: str) -> Dict[str, Set[str]]:
|
||||
@ -204,11 +207,11 @@ def cleanup_orphaned_files(unique_id: str, changes: Dict) -> Dict[str, List[str]
|
||||
removed_files[group_name].append(f"processed: {Path(filename).stem}")
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error cleaning up {filename}: {str(e)}")
|
||||
logger.error(f"Error cleaning up {filename}: {str(e)}")
|
||||
|
||||
# Handle completely removed groups
|
||||
for group_name in changes.get("removed_groups", set()):
|
||||
print(f"Cleaning up completely removed group: {group_name}")
|
||||
logger.info(f"Cleaning up completely removed group: {group_name}")
|
||||
removed_files[group_name] = []
|
||||
|
||||
try:
|
||||
@ -230,10 +233,10 @@ def cleanup_orphaned_files(unique_id: str, changes: Dict) -> Dict[str, List[str]
|
||||
shutil.rmtree(dataset_group_dir)
|
||||
removed_files[group_name].append("dataset group directory")
|
||||
|
||||
print(f"Completely removed group '{group_name}' and all its data")
|
||||
logger.info(f"Completely removed group '{group_name}' and all its data")
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error cleaning up group {group_name}: {str(e)}")
|
||||
logger.error(f"Error cleaning up group {group_name}: {str(e)}")
|
||||
|
||||
return removed_files
|
||||
|
||||
|
||||
@ -10,9 +10,13 @@ import aiohttp
|
||||
import shutil
|
||||
import zipfile
|
||||
import tempfile
|
||||
import logging
|
||||
from typing import Dict, List, Optional
|
||||
from pathlib import Path
|
||||
|
||||
# 配置日志
|
||||
logger = logging.getLogger('app')
|
||||
|
||||
|
||||
async def download_file(url: str, destination_path: str) -> bool:
|
||||
"""Download file from URL asynchronously"""
|
||||
@ -25,10 +29,10 @@ async def download_file(url: str, destination_path: str) -> bool:
|
||||
await f.write(chunk)
|
||||
return True
|
||||
else:
|
||||
print(f"Failed to download {url}, status code: {response.status}")
|
||||
logger.error(f"Failed to download {url}, status code: {response.status}")
|
||||
return False
|
||||
except Exception as e:
|
||||
print(f"Error downloading {url}: {str(e)}")
|
||||
logger.error(f"Error downloading {url}: {str(e)}")
|
||||
return False
|
||||
|
||||
|
||||
@ -45,11 +49,11 @@ def remove_file_or_directory(path: str):
|
||||
os.remove(path)
|
||||
elif os.path.isdir(path):
|
||||
shutil.rmtree(path)
|
||||
print(f"Removed: {path}")
|
||||
logger.info(f"Removed: {path}")
|
||||
else:
|
||||
print(f"Path does not exist: {path}")
|
||||
logger.warning(f"Path does not exist: {path}")
|
||||
except Exception as e:
|
||||
print(f"Error removing {path}: {str(e)}")
|
||||
logger.error(f"Error removing {path}: {str(e)}")
|
||||
|
||||
|
||||
def extract_zip_file(zip_path: str, extract_dir: str) -> List[str]:
|
||||
@ -65,11 +69,11 @@ def extract_zip_file(zip_path: str, extract_dir: str) -> List[str]:
|
||||
if file.lower().endswith(('.txt', '.md', '.xlsx', '.xls', '.csv')):
|
||||
extracted_files.append(os.path.join(root, file))
|
||||
|
||||
print(f"Extracted {len(extracted_files)} txt/md/xlsx/csv files from {zip_path}")
|
||||
logger.info(f"Extracted {len(extracted_files)} txt/md/xlsx/csv files from {zip_path}")
|
||||
return extracted_files
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error extracting zip file {zip_path}: {str(e)}")
|
||||
logger.error(f"Error extracting zip file {zip_path}: {str(e)}")
|
||||
return []
|
||||
|
||||
|
||||
@ -110,7 +114,7 @@ def load_processed_files_log(unique_id: str) -> Dict[str, Dict]:
|
||||
with open(log_file, 'r', encoding='utf-8') as f:
|
||||
return json.load(f)
|
||||
except Exception as e:
|
||||
print(f"Error loading processed files log: {e}")
|
||||
logger.error(f"Error loading processed files log: {e}")
|
||||
return {}
|
||||
|
||||
|
||||
@ -123,7 +127,7 @@ def save_processed_files_log(unique_id: str, processed_log: Dict[str, Dict]):
|
||||
with open(log_file, 'w', encoding='utf-8') as f:
|
||||
json.dump(processed_log, f, ensure_ascii=False, indent=2)
|
||||
except Exception as e:
|
||||
print(f"Error saving processed files log: {e}")
|
||||
logger.error(f"Error saving processed files log: {e}")
|
||||
|
||||
|
||||
def get_processing_log(unique_id: str) -> Dict:
|
||||
@ -135,7 +139,7 @@ def get_processing_log(unique_id: str) -> Dict:
|
||||
with open(log_file, 'r', encoding='utf-8') as f:
|
||||
return json.load(f)
|
||||
except Exception as e:
|
||||
print(f"Error loading processing log: {e}")
|
||||
logger.error(f"Error loading processing log: {e}")
|
||||
return {}
|
||||
|
||||
|
||||
@ -148,7 +152,7 @@ def save_project_status(unique_id: str, status: Dict):
|
||||
with open(status_file, 'w', encoding='utf-8') as f:
|
||||
json.dump(status, f, ensure_ascii=False, indent=2)
|
||||
except Exception as e:
|
||||
print(f"Error saving project status: {e}")
|
||||
logger.error(f"Error saving project status: {e}")
|
||||
|
||||
|
||||
def load_project_status(unique_id: str) -> Dict:
|
||||
@ -160,7 +164,7 @@ def load_project_status(unique_id: str) -> Dict:
|
||||
with open(status_file, 'r', encoding='utf-8') as f:
|
||||
return json.load(f)
|
||||
except Exception as e:
|
||||
print(f"Error loading project status: {e}")
|
||||
logger.error(f"Error loading project status: {e}")
|
||||
return {}
|
||||
|
||||
|
||||
@ -212,7 +216,7 @@ def update_file_processing_status(unique_id: str, group_name: str, filename: str
|
||||
json.dump(file_status, f, ensure_ascii=False, indent=2)
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error updating file processing status: {e}")
|
||||
logger.error(f"Error updating file processing status: {e}")
|
||||
|
||||
|
||||
def get_file_processing_status(unique_id: str, group_name: str = None, filename: str = None) -> Dict:
|
||||
@ -240,7 +244,7 @@ def get_file_processing_status(unique_id: str, group_name: str = None, filename:
|
||||
return file_status
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error getting file processing status: {e}")
|
||||
logger.error(f"Error getting file processing status: {e}")
|
||||
return {}
|
||||
|
||||
|
||||
@ -254,7 +258,7 @@ def calculate_directory_size(directory_path: str) -> int:
|
||||
if os.path.exists(file_path):
|
||||
total_size += os.path.getsize(file_path)
|
||||
except Exception as e:
|
||||
print(f"Error calculating directory size: {e}")
|
||||
logger.error(f"Error calculating directory size: {e}")
|
||||
|
||||
return total_size
|
||||
|
||||
|
||||
utils/log_util/context.py (new file, 32 lines)
@@ -0,0 +1,32 @@
# author: askfiy

import contextvars


class GlobalContext:
    def __init__(self):
        super().__setattr__('_contextvar', contextvars.ContextVar(f"{__class__.__name__}_g"))

    def __getattr__(self, k: str):
        ctx = self._contextvar.get()
        return ctx[k]

    def __setattr__(self, k: str, v):
        try:
            ctx = self._contextvar.get()
        except LookupError:
            ctx = {}

        ctx.update({k: v})
        self._contextvar.set(ctx)

    def get_context(self):
        return self._contextvar.get()

    def update_context(self, ctx):
        self._contextvar.set(ctx)


g = GlobalContext()

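For orientation, a minimal usage sketch of the `g` object defined in `utils/log_util/context.py` above (the sketch itself is not part of the commit; only the import path comes from this diff): values assigned to `g` are stored in the current `contextvars` context rather than on the instance, so each request can carry its own `trace_id`.

```python
# Hypothetical standalone sketch, assuming the utils.log_util package
# introduced by this commit is importable.
import contextvars

from utils.log_util.context import g

g.trace_id = "demo-trace-id"           # stored via ContextVar, not on the instance
print(g.trace_id)                      # -> demo-trace-id

# A copied context (e.g. an asyncio task) still sees the value; a separate
# request handled by the middleware further below gets its own trace_id.
ctx = contextvars.copy_context()
print(ctx.run(lambda: g.trace_id))     # -> demo-trace-id
```
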
utils/log_util/decorator.py (new file, 32 lines)
@@ -0,0 +1,32 @@
import time
import logging
from functools import wraps
from functools import update_wrapper

from .context import g


app_logger = logging.getLogger('app')


def safe_network(func):
    @wraps(func)
    def wrapper(*args, **kwargs):
        while True:
            try:
                return func(*args, **kwargs)
            except Exception as exc:
                app_logger.error(exc)
                time.sleep(1)

    return wrapper


def copy_threading_context(func):
    main_threading_g_ctx = g.get_context()

    def wrapper(*args, **kwargs):
        g.update_context(main_threading_g_ctx)
        return func(*args, **kwargs)

    return update_wrapper(wrapper, func)
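A short, hedged sketch of how `copy_threading_context` above can be used (not from the commit): the decorator snapshots `g`'s context at wrap time, so a function handed to a worker thread still sees the caller's `trace_id`.

```python
# Sketch only; assumes the utils.log_util package from this commit.
import threading

from utils.log_util.context import g
from utils.log_util.decorator import copy_threading_context

g.trace_id = "demo-trace-id"

@copy_threading_context              # captures g's context at decoration time
def worker():
    print(g.trace_id)                # -> demo-trace-id, even in another thread

t = threading.Thread(target=worker)
t.start()
t.join()
```
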
utils/log_util/logger.py (new file, 93 lines)
@@ -0,0 +1,93 @@
import logging
import uuid
import json
from datetime import datetime

from fastapi import Request, FastAPI
from starlette.responses import JSONResponse
from starlette.requests import HTTPConnection
from typing import Callable, Awaitable

from .context import g


def add_request_routes(app: FastAPI):
    @app.middleware("http")
    async def before_request(request: Request, call_next: Callable[[Request], Awaitable[JSONResponse]]):

        # 先从header中获取X-Trace-Id,如果没有则生成新的
        trace_id = request.headers.get('X-Trace-Id')
        if not trace_id:
            trace_id = "generate_" + str(uuid.uuid4())

        # user_id = "未知的 user_id"

        g.trace_id = trace_id
        # g.user_id = user_id
        response = await call_next(request)

        response.headers['X-Trace-Id'] = g.trace_id
        return response


class Formatter(logging.Formatter):
    def formatTime(self, record, datefmt=None):
        # 将时间戳转换为datetime对象
        dt = datetime.fromtimestamp(record.created)
        # 格式化时间戳到毫秒
        if datefmt:
            s = dt.strftime(datefmt)
            # 去除微秒的后三位,保留毫秒部分
            s = s[:-3]
        else:
            # 去除微秒的后三位,保留毫秒部分
            s = dt.strftime("%H:%M:%S.%f")[:-3]
        return s

    def format(self, record):
        # 处理 trace_id
        if not hasattr(record, "trace_id"):
            record.trace_id = getattr(g, "trace_id")
        # 处理 user_id
        # if not hasattr(record, "user_id"):
        #     record.user_id = getattr(g, "user_id")

        # 格式化时间戳
        record.timestamp = self.formatTime(record, self.datefmt)

        return super().format(record)


# 这里注册session 上下文追踪一次就可以了
def init_logger_once(name, level):
    logger = logging.getLogger(name)
    logger.setLevel(level=level)
    formatter = Formatter("%(timestamp)s | %(levelname)-5s | %(trace_id)s | %(name)s:%(funcName)s:%(lineno)s - %(message)s", datefmt='%Y-%m-%d %H:%M:%S.%f')
    handler = logging.StreamHandler()
    handler.setFormatter(formatter)
    logger.addHandler(handler)


def init_with_fastapi(app, level=logging.INFO):
    init_logger_once("app", level)
    add_request_routes(app)

def info(message, *args, **kwargs):
    app_logger = logging.getLogger('app')
    app_logger.info(message, *args, **kwargs)

def debug(message, *args, **kwargs):
    app_logger = logging.getLogger('app')
    app_logger.debug(message, *args, **kwargs)

def warning(message, *args, **kwargs):
    app_logger = logging.getLogger('app')
    app_logger.warning(message, *args, **kwargs)

def error(message, *args, **kwargs):
    app_logger = logging.getLogger('app')
    app_logger.error(message, *args, **kwargs)

def critical(message, *args, **kwargs):
    app_logger = logging.getLogger('app')
    app_logger.critical(message, *args, **kwargs)
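Taken together, the three files above give the pattern used throughout the rest of this commit: call `init_with_fastapi(app)` once at startup, then fetch the shared `'app'` logger in any module. A hedged wiring sketch follows; the entrypoint shape and the `/ping` route are illustrative assumptions, only `init_with_fastapi` and the `'app'` logger name come from this diff.

```python
# Illustrative entrypoint; only init_with_fastapi and the 'app' logger name
# are taken from this commit, the rest is an assumed example.
import logging

from fastapi import FastAPI

from utils.log_util.logger import init_with_fastapi

app = FastAPI()
init_with_fastapi(app, level=logging.INFO)   # install Formatter + X-Trace-Id middleware

logger = logging.getLogger('app')            # same line the other modules now use

@app.get("/ping")
async def ping():
    logger.info("ping received")             # record is formatted with the request trace_id
    return {"status": "ok"}
```
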
@ -6,10 +6,14 @@
|
||||
import os
|
||||
import shutil
|
||||
import json
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import List, Dict, Optional
|
||||
from datetime import datetime
|
||||
|
||||
# Configure logger
|
||||
logger = logging.getLogger('app')
|
||||
|
||||
from utils.file_utils import get_document_preview
|
||||
|
||||
|
||||
@ -177,11 +181,11 @@ def copy_dataset_folder(source_project_id: str, target_dataset_dir: Path, folder
|
||||
shutil.copytree(source_folder, target_folder)
|
||||
result["success"] = True
|
||||
|
||||
print(f" Copied: {source_folder} -> {target_folder}")
|
||||
logger.info(f" Copied: {source_folder} -> {target_folder}")
|
||||
|
||||
except Exception as e:
|
||||
result["error"] = str(e)
|
||||
print(f" Error copying {folder_name}: {str(e)}")
|
||||
logger.error(f" Error copying {folder_name}: {str(e)}")
|
||||
|
||||
return result
|
||||
|
||||
@ -237,7 +241,7 @@ def generate_robot_readme(robot_id: str, dataset_ids: List[str], copy_results: L
|
||||
if item_path.is_dir():
|
||||
doc_dirs.append(item)
|
||||
except Exception as e:
|
||||
print(f"Error listing dataset directories: {str(e)}")
|
||||
logger.error(f"Error listing dataset directories: {str(e)}")
|
||||
|
||||
if not doc_dirs:
|
||||
readme_content += "No document directories found.\n"
|
||||
@ -292,7 +296,7 @@ def generate_robot_readme(robot_id: str, dataset_ids: List[str], copy_results: L
|
||||
with open(readme_path, 'w', encoding='utf-8') as f:
|
||||
f.write(readme_content)
|
||||
|
||||
print(f"Generated README: {readme_path}")
|
||||
logger.info(f"Generated README: {readme_path}")
|
||||
return str(readme_path)
|
||||
|
||||
|
||||
@ -314,14 +318,14 @@ def should_rebuild_robot_project(dataset_ids: List[str], bot_id: str) -> bool:
|
||||
|
||||
# 如果机器人项目不存在,需要创建
|
||||
if not robot_dir.exists():
|
||||
print(f"Robot project does not exist, need to create: {bot_id}")
|
||||
return True
|
||||
logger.info(f"Robot project does not exist, need to create: {bot_id}")
|
||||
return True
|
||||
|
||||
# 检查机器人项目的配置信息
|
||||
config_file = robot_dir / "robot_config.json"
|
||||
if not config_file.exists():
|
||||
print(f"Robot config file not found, need to rebuild: {bot_id}")
|
||||
return True
|
||||
logger.info(f"Robot config file not found, need to rebuild: {bot_id}")
|
||||
return True
|
||||
|
||||
# 读取配置信息
|
||||
try:
|
||||
@ -329,8 +333,8 @@ def should_rebuild_robot_project(dataset_ids: List[str], bot_id: str) -> bool:
|
||||
config = json.load(f)
|
||||
cached_dataset_ids = set(config.get("dataset_ids", []))
|
||||
except Exception as e:
|
||||
print(f"Error reading robot config: {e}, need to rebuild")
|
||||
return True
|
||||
logger.error(f"Error reading robot config: {e}, need to rebuild")
|
||||
return True
|
||||
|
||||
# 检查dataset_ids是否有变化
|
||||
current_dataset_ids = set(dataset_ids)
|
||||
@ -338,14 +342,14 @@ def should_rebuild_robot_project(dataset_ids: List[str], bot_id: str) -> bool:
|
||||
# 如果有新增的dataset_id
|
||||
new_ids = current_dataset_ids - cached_dataset_ids
|
||||
if new_ids:
|
||||
print(f"Found new dataset_ids: {new_ids}, need to rebuild")
|
||||
return True
|
||||
logger.info(f"Found new dataset_ids: {new_ids}, need to rebuild")
|
||||
return True
|
||||
|
||||
# 如果有删除的dataset_id
|
||||
removed_ids = cached_dataset_ids - current_dataset_ids
|
||||
if removed_ids:
|
||||
print(f"Removed dataset_ids: {removed_ids}, need to rebuild")
|
||||
return True
|
||||
logger.info(f"Removed dataset_ids: {removed_ids}, need to rebuild")
|
||||
return True
|
||||
|
||||
# 获取机器人项目的最后修改时间
|
||||
robot_mod_time = robot_dir.stat().st_mtime
|
||||
@ -355,17 +359,17 @@ def should_rebuild_robot_project(dataset_ids: List[str], bot_id: str) -> bool:
|
||||
log_file = Path("projects") / "data" / source_project_id / "processing_log.json"
|
||||
|
||||
if not log_file.exists():
|
||||
print(f"Processing log file not found for project {source_project_id}, will rebuild")
|
||||
return True
|
||||
logger.info(f"Processing log file not found for project {source_project_id}, will rebuild")
|
||||
return True
|
||||
|
||||
log_mod_time = log_file.stat().st_mtime
|
||||
|
||||
# 如果任何一个processing_log.json文件比机器人项目新,需要重建
|
||||
if log_mod_time > robot_mod_time:
|
||||
print(f"Processing log updated for project {source_project_id}, need to rebuild")
|
||||
return True
|
||||
logger.info(f"Processing log updated for project {source_project_id}, need to rebuild")
|
||||
return True
|
||||
|
||||
print(f"Robot project {bot_id} is up to date, no rebuild needed")
|
||||
logger.info(f"Robot project {bot_id} is up to date, no rebuild needed")
|
||||
return False
|
||||
|
||||
|
||||
@ -381,12 +385,12 @@ def create_robot_project(dataset_ids: List[str], bot_id: str, force_rebuild: boo
|
||||
Returns:
|
||||
str: 机器人项目目录路径
|
||||
"""
|
||||
print(f"Creating robot project: {bot_id} from sources: {dataset_ids}")
|
||||
logger.info(f"Creating robot project: {bot_id} from sources: {dataset_ids}")
|
||||
|
||||
# 检查是否需要重建
|
||||
if not force_rebuild and not should_rebuild_robot_project(dataset_ids, bot_id):
|
||||
robot_dir = Path("projects") / "robot" / bot_id
|
||||
print(f"Using existing robot project: {robot_dir}")
|
||||
logger.info(f"Using existing robot project: {robot_dir}")
|
||||
return str(robot_dir)
|
||||
|
||||
# 创建机器人目录结构
|
||||
@ -395,8 +399,8 @@ def create_robot_project(dataset_ids: List[str], bot_id: str, force_rebuild: boo
|
||||
|
||||
# 清理已存在的目录(如果需要)
|
||||
if robot_dir.exists():
|
||||
print(f"Robot directory already exists, cleaning up: {robot_dir}")
|
||||
shutil.rmtree(robot_dir)
|
||||
logger.info(f"Robot directory already exists, cleaning up: {robot_dir}")
|
||||
shutil.rmtree(robot_dir)
|
||||
|
||||
robot_dir.mkdir(parents=True, exist_ok=True)
|
||||
dataset_dir.mkdir(parents=True, exist_ok=True)
|
||||
@ -405,19 +409,19 @@ def create_robot_project(dataset_ids: List[str], bot_id: str, force_rebuild: boo
|
||||
|
||||
# 遍历每个源项目
|
||||
for source_project_id in dataset_ids:
|
||||
print(f"\nProcessing source project: {source_project_id}")
|
||||
logger.info(f"\nProcessing source project: {source_project_id}")
|
||||
|
||||
source_dataset_dir = Path("projects") / "data" / source_project_id / "dataset"
|
||||
|
||||
if not source_dataset_dir.exists():
|
||||
print(f" Warning: Dataset directory not found for project {source_project_id}")
|
||||
logger.warning(f" Warning: Dataset directory not found for project {source_project_id}")
|
||||
continue
|
||||
|
||||
# 获取所有子文件夹
|
||||
folders = [f for f in source_dataset_dir.iterdir() if f.is_dir()]
|
||||
|
||||
if not folders:
|
||||
print(f" Warning: No folders found in dataset directory for project {source_project_id}")
|
||||
logger.warning(f" Warning: No folders found in dataset directory for project {source_project_id}")
|
||||
continue
|
||||
|
||||
# 复制每个文件夹
|
||||
@ -442,12 +446,12 @@ def create_robot_project(dataset_ids: List[str], bot_id: str, force_rebuild: boo
|
||||
|
||||
# 统计信息
|
||||
successful_copies = sum(1 for r in copy_results if r["success"])
|
||||
print(f"\nRobot project creation completed:")
|
||||
print(f" Robot directory: {robot_dir}")
|
||||
print(f" Total folders processed: {len(copy_results)}")
|
||||
print(f" Successful copies: {successful_copies}")
|
||||
print(f" Config saved: {config_file}")
|
||||
print(f" README generated: {readme_path}")
|
||||
logger.info(f"\nRobot project creation completed:")
|
||||
logger.info(f" Robot directory: {robot_dir}")
|
||||
logger.info(f" Total folders processed: {len(copy_results)}")
|
||||
logger.info(f" Successful copies: {successful_copies}")
|
||||
logger.info(f" Config saved: {config_file}")
|
||||
logger.info(f" README generated: {readme_path}")
|
||||
|
||||
return str(robot_dir)
|
||||
|
||||
@ -513,14 +517,14 @@ def cleanup_robot_project(bot_id: str) -> bool:
|
||||
|
||||
if robot_dir.exists():
|
||||
shutil.rmtree(robot_dir)
|
||||
print(f"Cleaned up robot project: {bot_id}")
|
||||
logger.info(f"Cleaned up robot project: {bot_id}")
|
||||
return True
|
||||
else:
|
||||
print(f"Robot project does not exist: {bot_id}")
|
||||
logger.info(f"Robot project does not exist: {bot_id}")
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error cleaning up robot project {bot_id}: {str(e)}")
|
||||
logger.error(f"Error cleaning up robot project {bot_id}: {str(e)}")
|
||||
return False
|
||||
|
||||
|
||||
@ -530,7 +534,7 @@ if __name__ == "__main__":
|
||||
test_bot_id = "test-robot-001"
|
||||
|
||||
robot_dir = create_robot_project(test_dataset_ids, test_bot_id)
|
||||
print(f"Created robot project at: {robot_dir}")
|
||||
logger.info(f"Created robot project at: {robot_dir}")
|
||||
|
||||
info = get_robot_project_info(test_bot_id)
|
||||
print(f"Robot project info: {json.dumps(info, indent=2, ensure_ascii=False)}")
|
||||
logger.info(f"Robot project info: {json.dumps(info, indent=2, ensure_ascii=False)}")
|
||||
|
||||
@ -1,8 +1,12 @@
|
||||
#!/usr/bin/env python3
|
||||
import os
|
||||
import shutil
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
# 配置日志
|
||||
logger = logging.getLogger('app')
|
||||
|
||||
def is_file_already_processed(target_file: Path, pagination_file: Path, embeddings_file: Path) -> bool:
|
||||
"""Check if a file has already been processed (document.txt, pagination.txt, and embeddings exist)"""
|
||||
if not target_file.exists():
|
||||
@ -22,22 +26,22 @@ def organize_single_project_files(unique_id: str, skip_processed=True):
|
||||
project_dir = Path("projects") / "data" / unique_id
|
||||
|
||||
if not project_dir.exists():
|
||||
print(f"Project directory not found: {project_dir}")
|
||||
logger.error(f"Project directory not found: {project_dir}")
|
||||
return
|
||||
|
||||
print(f"Organizing files for project: {unique_id} (skip_processed={skip_processed})")
|
||||
logger.info(f"Organizing files for project: {unique_id} (skip_processed={skip_processed})")
|
||||
|
||||
files_dir = project_dir / "files"
|
||||
dataset_dir = project_dir / "dataset"
|
||||
|
||||
# Check if files directory exists and has files
|
||||
if not files_dir.exists():
|
||||
print(f" No files directory found, skipping...")
|
||||
logger.info(f" No files directory found, skipping...")
|
||||
return
|
||||
|
||||
files = list(files_dir.glob("*"))
|
||||
if not files:
|
||||
print(f" Files directory is empty, skipping...")
|
||||
logger.info(f" Files directory is empty, skipping...")
|
||||
return
|
||||
|
||||
# Create dataset directory if it doesn't exist
|
||||
@ -55,10 +59,10 @@ def organize_single_project_files(unique_id: str, skip_processed=True):
|
||||
|
||||
# Check if file is already processed
|
||||
if skip_processed and is_file_already_processed(target_file, pagination_file, embeddings_file):
|
||||
print(f" Skipping already processed file: {file_path.name}")
|
||||
logger.info(f" Skipping already processed file: {file_path.name}")
|
||||
continue
|
||||
|
||||
print(f" Copying {file_path.name} -> {target_file.relative_to(project_dir)}")
|
||||
logger.info(f" Copying {file_path.name} -> {target_file.relative_to(project_dir)}")
|
||||
|
||||
# Create target directory
|
||||
target_dir.mkdir(exist_ok=True)
|
||||
@ -140,12 +144,12 @@ def organize_dataset_files():
|
||||
|
||||
# Check if files directory exists and has files
|
||||
if not files_dir.exists():
|
||||
print(f" No files directory found, skipping...")
|
||||
logger.info(f" No files directory found, skipping...")
|
||||
continue
|
||||
|
||||
files = list(files_dir.glob("*"))
|
||||
if not files:
|
||||
print(f" Files directory is empty, skipping...")
|
||||
logger.info(f" Files directory is empty, skipping...")
|
||||
continue
|
||||
|
||||
# Create dataset directory if it doesn't exist
|
||||
@ -159,7 +163,7 @@ def organize_dataset_files():
|
||||
target_dir = dataset_dir / file_name_without_ext
|
||||
target_file = target_dir / "document.txt"
|
||||
|
||||
print(f" Copying {file_path.name} -> {target_file.relative_to(project_dir)}")
|
||||
logger.info(f" Copying {file_path.name} -> {target_file.relative_to(project_dir)}")
|
||||
|
||||
# Create target directory
|
||||
target_dir.mkdir(exist_ok=True)
|
||||
|
||||
@ -5,9 +5,13 @@ Project management functions for handling projects, README generation, and statu
|
||||
|
||||
import os
|
||||
import json
|
||||
import logging
|
||||
from typing import Dict, List, Optional
|
||||
from pathlib import Path
|
||||
|
||||
# Configure logger
|
||||
logger = logging.getLogger('app')
|
||||
|
||||
from utils.file_utils import get_document_preview, load_processed_files_log
|
||||
|
||||
|
||||
@ -145,7 +149,7 @@ This project contains processed documents and their associated embeddings for se
|
||||
if os.path.isdir(item_path):
|
||||
doc_dirs.append(item)
|
||||
except Exception as e:
|
||||
print(f"Error listing dataset directories: {str(e)}")
|
||||
logger.error(f"Error listing dataset directories: {str(e)}")
|
||||
|
||||
if not doc_dirs:
|
||||
readme_content += "No document directories found.\n"
|
||||
@ -198,10 +202,10 @@ def save_project_readme(unique_id: str):
|
||||
os.makedirs(os.path.dirname(readme_path), exist_ok=True)
|
||||
with open(readme_path, 'w', encoding='utf-8') as f:
|
||||
f.write(readme_content)
|
||||
print(f"Generated README.md for project {unique_id}")
|
||||
logger.info(f"Generated README.md for project {unique_id}")
|
||||
return readme_path
|
||||
except Exception as e:
|
||||
print(f"Error generating README for project {unique_id}: {str(e)}")
|
||||
logger.error(f"Error generating README for project {unique_id}: {str(e)}")
|
||||
return None
|
||||
|
||||
|
||||
@ -264,13 +268,13 @@ def remove_project(unique_id: str) -> bool:
|
||||
if os.path.exists(project_dir):
|
||||
import shutil
|
||||
shutil.rmtree(project_dir)
|
||||
print(f"Removed project directory: {project_dir}")
|
||||
logger.info(f"Removed project directory: {project_dir}")
|
||||
return True
|
||||
else:
|
||||
print(f"Project directory not found: {project_dir}")
|
||||
logger.warning(f"Project directory not found: {project_dir}")
|
||||
return False
|
||||
except Exception as e:
|
||||
print(f"Error removing project {unique_id}: {str(e)}")
|
||||
logger.error(f"Error removing project {unique_id}: {str(e)}")
|
||||
return False
|
||||
|
||||
|
||||
@ -284,7 +288,7 @@ def list_projects() -> List[str]:
|
||||
return [item for item in os.listdir(projects_dir)
|
||||
if os.path.isdir(os.path.join(projects_dir, item))]
|
||||
except Exception as e:
|
||||
print(f"Error listing projects: {str(e)}")
|
||||
logger.error(f"Error listing projects: {str(e)}")
|
||||
return []
|
||||
|
||||
|
||||
|
||||
@ -6,9 +6,13 @@ Single file processing functions for handling individual files.
|
||||
import os
|
||||
import tempfile
|
||||
import zipfile
|
||||
import logging
|
||||
from typing import Dict, List, Tuple, Optional
|
||||
from pathlib import Path
|
||||
|
||||
# Configure logger
|
||||
logger = logging.getLogger('app')
|
||||
|
||||
from utils.file_utils import download_file
|
||||
|
||||
# Try to import excel/csv processor, but handle if dependencies are missing
|
||||
@ -18,7 +22,7 @@ try:
|
||||
)
|
||||
EXCEL_CSV_SUPPORT = True
|
||||
except ImportError as e:
|
||||
print(f"Excel/CSV processing not available: {e}")
|
||||
logger.warning(f"Excel/CSV processing not available: {e}")
|
||||
EXCEL_CSV_SUPPORT = False
|
||||
|
||||
# Fallback functions
|
||||
@ -71,7 +75,7 @@ async def process_single_file(
|
||||
# Download file if it's remote and not yet downloaded
|
||||
if original_path.startswith(('http://', 'https://')):
|
||||
if not os.path.exists(local_path):
|
||||
print(f"Downloading {original_path} -> {local_path}")
|
||||
logger.info(f"Downloading {original_path} -> {local_path}")
|
||||
success = await download_file(original_path, local_path)
|
||||
if not success:
|
||||
result["error"] = "Failed to download file"
|
||||
@ -112,11 +116,11 @@ async def process_single_file(
|
||||
|
||||
except Exception as e:
|
||||
result["error"] = f"Embedding generation failed: {str(e)}"
|
||||
print(f"Failed to generate embeddings for {filename}: {str(e)}")
|
||||
logger.error(f"Failed to generate embeddings for {filename}: {str(e)}")
|
||||
|
||||
except Exception as e:
|
||||
result["error"] = f"File processing failed: {str(e)}"
|
||||
print(f"Error processing file {filename}: {str(e)}")
|
||||
logger.error(f"Error processing file {filename}: {str(e)}")
|
||||
|
||||
return result
|
||||
|
||||
@ -167,14 +171,14 @@ async def extract_from_zip(zip_path: str, filename: str) -> Tuple[str, List[str]
|
||||
pagination_lines.extend(file_pagination)
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error processing extracted file {file}: {str(e)}")
|
||||
logger.error(f"Error processing extracted file {file}: {str(e)}")
|
||||
|
||||
# Clean up temporary directory
|
||||
import shutil
|
||||
shutil.rmtree(temp_dir)
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error extracting zip file {filename}: {str(e)}")
|
||||
logger.error(f"Error extracting zip file {filename}: {str(e)}")
|
||||
return "", []
|
||||
|
||||
return '\n\n'.join(content_parts), pagination_lines
|
||||
@ -192,7 +196,7 @@ async def extract_from_excel(file_path: str, filename: str) -> Tuple[str, List[s
|
||||
return "", []
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error processing Excel file {filename}: {str(e)}")
|
||||
logger.error(f"Error processing Excel file {filename}: {str(e)}")
|
||||
return "", []
|
||||
|
||||
|
||||
@ -208,7 +212,7 @@ async def extract_from_csv(file_path: str, filename: str) -> Tuple[str, List[str
|
||||
return "", []
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error processing CSV file {filename}: {str(e)}")
|
||||
logger.error(f"Error processing CSV file {filename}: {str(e)}")
|
||||
return "", []
|
||||
|
||||
|
||||
@ -224,7 +228,7 @@ async def extract_from_text(file_path: str, filename: str) -> Tuple[str, List[st
|
||||
return "", []
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error reading text file {filename}: {str(e)}")
|
||||
logger.error(f"Error reading text file {filename}: {str(e)}")
|
||||
return "", []
|
||||
|
||||
|
||||
@ -248,7 +252,7 @@ def generate_pagination_from_text(document_path: str, pagination_path: str) -> L
|
||||
return pagination_lines
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error generating pagination from text: {str(e)}")
|
||||
logger.error(f"Error generating pagination from text: {str(e)}")
|
||||
return []
|
||||
|
||||
|
||||
@ -273,7 +277,7 @@ async def generate_embeddings_for_file(document_path: str, embedding_path: str)
|
||||
return None
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error generating embeddings: {str(e)}")
|
||||
logger.error(f"Error generating embeddings: {str(e)}")
|
||||
return None
|
||||
|
||||
|
||||
|
||||
@ -9,9 +9,13 @@ import multiprocessing
|
||||
import threading
|
||||
import time
|
||||
import resource
|
||||
import logging
|
||||
from typing import Dict, Any, Optional
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
|
||||
# 配置日志
|
||||
logger = logging.getLogger('app')
|
||||
|
||||
|
||||
class SystemOptimizer:
|
||||
"""系统优化器
|
||||
@ -37,9 +41,9 @@ class SystemOptimizer:
|
||||
new_limit = min(65536, hard) # 增加到65536或硬件限制
|
||||
resource.setrlimit(resource.RLIMIT_NOFILE, (new_limit, hard))
|
||||
self.original_settings['RLIMIT_NOFILE'] = (soft, hard)
|
||||
print(f"文件描述符限制从 {soft} 增加到 {new_limit}")
|
||||
logger.info(f"文件描述符限制从 {soft} 增加到 {new_limit}")
|
||||
except (ValueError, OSError) as e:
|
||||
print(f"无法设置文件描述符限制: {e}")
|
||||
logger.error(f"无法设置文件描述符限制: {e}")
|
||||
|
||||
# 2. 优化线程栈大小
|
||||
try:
|
||||
@ -47,9 +51,9 @@ class SystemOptimizer:
|
||||
new_stack = min(8 * 1024 * 1024, hard) # 8MB栈大小
|
||||
resource.setrlimit(resource.RLIMIT_STACK, (new_stack, hard))
|
||||
self.original_settings['RLIMIT_STACK'] = (soft, hard)
|
||||
print(f"线程栈大小设置为 {new_stack // (1024*1024)}MB")
|
||||
logger.info(f"线程栈大小设置为 {new_stack // (1024*1024)}MB")
|
||||
except (ValueError, OSError) as e:
|
||||
print(f"无法设置线程栈大小: {e}")
|
||||
logger.error(f"无法设置线程栈大小: {e}")
|
||||
|
||||
# 3. 环境变量优化
|
||||
env_vars = {
|
||||
@ -85,10 +89,10 @@ class SystemOptimizer:
|
||||
for key, value in env_vars.items():
|
||||
if key not in os.environ:
|
||||
os.environ[key] = value
|
||||
print(f"设置环境变量: {key}={value}")
|
||||
logger.info(f"设置环境变量: {key}={value}")
|
||||
|
||||
self.optimized = True
|
||||
print("系统优化完成")
|
||||
logger.info("系统优化完成")
|
||||
|
||||
def _backup_original_settings(self):
|
||||
"""备份原始设置"""
|
||||
@ -115,20 +119,20 @@ class SystemOptimizer:
|
||||
if not self.original_settings:
|
||||
return
|
||||
|
||||
print("恢复原始系统设置...")
|
||||
logger.info("恢复原始系统设置...")
|
||||
|
||||
# 恢复资源限制
|
||||
if 'RLIMIT_NOFILE' in self.original_settings:
|
||||
try:
|
||||
resource.setrlimit(resource.RLIMIT_NOFILE, self.original_settings['RLIMIT_NOFILE'])
|
||||
print(f"恢复文件描述符限制")
|
||||
logger.info(f"恢复文件描述符限制")
|
||||
except:
|
||||
pass
|
||||
|
||||
if 'RLIMIT_STACK' in self.original_settings:
|
||||
try:
|
||||
resource.setrlimit(resource.RLIMIT_STACK, self.original_settings['RLIMIT_STACK'])
|
||||
print(f"恢复线程栈大小")
|
||||
logger.info(f"恢复线程栈大小")
|
||||
except:
|
||||
pass
|
||||
|
||||
@ -137,13 +141,13 @@ class SystemOptimizer:
|
||||
if key.startswith('TOKENIZERS_') or key in ['PYTHONUNBUFFERED', 'PYTHONDONTWRITEBYTECODE']:
|
||||
if key in os.environ:
|
||||
del os.environ[key]
|
||||
print(f"移除环境变量: {key}")
|
||||
logger.info(f"移除环境变量: {key}")
|
||||
elif key in ['OMP_NUM_THREADS'] and value is not None:
|
||||
os.environ[key] = value
|
||||
print(f"恢复环境变量: {key}={value}")
|
||||
logger.info(f"恢复环境变量: {key}={value}")
|
||||
|
||||
self.optimized = False
|
||||
print("系统设置已恢复")
|
||||
logger.info("系统设置已恢复")
|
||||
|
||||
|
||||
class AsyncioOptimizer:
|
||||
@ -156,9 +160,9 @@ class AsyncioOptimizer:
|
||||
# 尝试使用uvloop(如果可用)
|
||||
import uvloop
|
||||
asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
|
||||
print("使用uvloop事件循环策略")
|
||||
logger.info("使用uvloop事件循环策略")
|
||||
except ImportError:
|
||||
print("使用默认事件循环策略")
|
||||
logger.info("使用默认事件循环策略")
|
||||
|
||||
# 设置线程池大小
|
||||
cpu_count = multiprocessing.cpu_count()
|
||||
@ -166,7 +170,7 @@ class AsyncioOptimizer:
|
||||
|
||||
# 注意:不能在这里设置默认线程池执行器,因为还没有运行事件循环
|
||||
# 这个设置会在应用启动时进行
|
||||
print(f"建议线程池大小: {thread_pool_size}")
|
||||
logger.info(f"建议线程池大小: {thread_pool_size}")
|
||||
|
||||
@staticmethod
|
||||
def optimize_gunicorn_settings() -> Dict[str, Any]:
|
||||
|
||||
@ -9,10 +9,14 @@ import hashlib
|
||||
import zipfile
|
||||
import requests
|
||||
import tempfile
|
||||
import logging
|
||||
from typing import List, Optional
|
||||
from urllib.parse import urlparse
|
||||
from pathlib import Path
|
||||
|
||||
# 配置日志
|
||||
logger = logging.getLogger('app')
|
||||
|
||||
|
||||
class ZipProjectHandler:
|
||||
"""ZIP项目处理器"""
|
||||
@ -56,7 +60,7 @@ class ZipProjectHandler:
|
||||
|
||||
return True
|
||||
except Exception as e:
|
||||
print(f"下载文件失败: {e}")
|
||||
logger.error(f"下载文件失败: {e}")
|
||||
return False
|
||||
|
||||
def _copy_local_file(self, local_path: str, target_path: str) -> bool:
|
||||
@ -66,7 +70,7 @@ class ZipProjectHandler:
|
||||
shutil.copy2(local_path, target_path)
|
||||
return True
|
||||
except Exception as e:
|
||||
print(f"复制本地文件失败: {e}")
|
||||
logger.error(f"复制本地文件失败: {e}")
|
||||
return False
|
||||
|
||||
def _extract_zip(self, zip_path: str, extract_to: str) -> bool:
|
||||
@ -76,7 +80,7 @@ class ZipProjectHandler:
|
||||
zip_ref.extractall(extract_to)
|
||||
return True
|
||||
except Exception as e:
|
||||
print(f"解压ZIP文件失败: {e}")
|
||||
logger.error(f"解压ZIP文件失败: {e}")
|
||||
return False
|
||||
|
||||
def get_project_from_zip(self, zip_url: str, unique_id: Optional[str] = None) -> Optional[str]:
|
||||
@ -91,7 +95,7 @@ class ZipProjectHandler:
|
||||
Optional[str]: 成功时返回项目目录路径,失败时返回None
|
||||
"""
|
||||
if not self._is_valid_url_or_path(zip_url):
|
||||
print(f"无效的URL或路径: {zip_url}")
|
||||
logger.error(f"无效的URL或路径: {zip_url}")
|
||||
return None
|
||||
|
||||
# 使用unique_id作为目录名,如果没有则使用url_hash
|
||||
@ -104,7 +108,7 @@ class ZipProjectHandler:
|
||||
cached_project_dir = self.projects_dir / project_dir_name
|
||||
|
||||
if cached_project_dir.exists() and not unique_id:
|
||||
print(f"使用缓存的项目目录: {cached_project_dir}")
|
||||
logger.info(f"使用缓存的项目目录: {cached_project_dir}")
|
||||
return str(cached_project_dir)
|
||||
|
||||
# 下载或复制ZIP文件
|
||||
@ -125,24 +129,24 @@ class ZipProjectHandler:
|
||||
is_url = False
|
||||
|
||||
if is_url:
|
||||
print(f"下载ZIP文件: {zip_url}")
|
||||
logger.info(f"下载ZIP文件: {zip_url}")
|
||||
if not self._download_file(zip_url, str(zip_path)):
|
||||
return None
|
||||
else:
|
||||
print(f"复制本地ZIP文件: {zip_url}")
|
||||
logger.info(f"复制本地ZIP文件: {zip_url}")
|
||||
# 解析相对路径
|
||||
local_path = Path(zip_url).resolve()
|
||||
if not self._copy_local_file(str(local_path), str(zip_path)):
|
||||
return None
|
||||
else:
|
||||
print(f"使用缓存的ZIP文件: {zip_path}")
|
||||
logger.info(f"使用缓存的ZIP文件: {zip_path}")
|
||||
|
||||
# 解压到项目目录
|
||||
print(f"解压ZIP文件到: {cached_project_dir}")
|
||||
logger.info(f"解压ZIP文件到: {cached_project_dir}")
|
||||
if not self._extract_zip(str(zip_path), str(cached_project_dir)):
|
||||
return None
|
||||
|
||||
print(f"项目准备完成: {cached_project_dir}")
|
||||
logger.info(f"项目准备完成: {cached_project_dir}")
|
||||
return str(cached_project_dir)
|
||||
|
||||
def collect_document_files(self, project_dir: str) -> List[str]:
|
||||
@ -159,7 +163,7 @@ class ZipProjectHandler:
|
||||
project_path = Path(project_dir)
|
||||
|
||||
if not project_path.exists():
|
||||
print(f"项目目录不存在: {project_dir}")
|
||||
logger.error(f"项目目录不存在: {project_dir}")
|
||||
return document_files
|
||||
|
||||
# 递归搜索所有 document.txt 文件
|
||||
@ -167,11 +171,11 @@ class ZipProjectHandler:
|
||||
if file_path.is_file():
|
||||
document_files.append(str(file_path))
|
||||
|
||||
print(f"在项目目录 {project_dir} 中找到 {len(document_files)} 个 document.txt 文件")
|
||||
logger.info(f"在项目目录 {project_dir} 中找到 {len(document_files)} 个 document.txt 文件")
|
||||
for file_path in document_files[:5]: # 只打印前5个文件路径作为示例
|
||||
print(f" - {file_path}")
|
||||
logger.info(f" - {file_path}")
|
||||
if len(document_files) > 5:
|
||||
print(f" ... 还有 {len(document_files) - 5} 个文件")
|
||||
logger.info(f" ... 还有 {len(document_files) - 5} 个文件")
|
||||
|
||||
return document_files
|
||||
|
||||
@ -182,9 +186,9 @@ class ZipProjectHandler:
|
||||
if self.cache_dir.exists():
|
||||
shutil.rmtree(self.cache_dir)
|
||||
self.cache_dir.mkdir(exist_ok=True)
|
||||
print("缓存清理完成")
|
||||
logger.info("缓存清理完成")
|
||||
except Exception as e:
|
||||
print(f"清理缓存失败: {e}")
|
||||
logger.error(f"清理缓存失败: {e}")
|
||||
|
||||
|
||||
# 全局ZIP项目处理器实例
|
||||
|
||||