229 lines
8.1 KiB
Python
229 lines
8.1 KiB
Python
import re
|
||
from collections import Counter
|
||
from typing import Any
|
||
|
||
# Number of top-ranked sentences retained for each requested summary length.
SUMMARY_LENGTH_LIMITS = {
    "tldr": 2,
    "short": 4,
    "standard": 6,
    "detailed": 10,
}

# Patterns that detect an action-item prefix (English "TODO"/"action" or the
# Chinese equivalents); capture group 1 holds the task text that follows.
ACTION_PREFIX_PATTERNS = [
    r"^(?:TODO|待办|action)[::]?\s*(.+)$",
    r"^(?:需要|跟进)\s*(.+)$",
]

# Risk keywords bucketed by impact level. Insertion order matters: the risk
# extractor checks levels in this order and stops at the first match, so
# "high" wins over "medium" wins over "low" within one sentence.
RISK_KEYWORDS = {
    "high": ["阻塞", "blocker", "故障", "失败", "严重", "不可用"],
    "medium": ["风险", "延迟", "异常", "超时", "报错"],
    "low": ["提醒", "注意", "观察", "待确认"],
}

# Optional boolean flags accepted in the request payload.
BOOL_FIELDS = ["extract_actions", "extract_risks"]
# Allowed values for the optional "style" field.
VALID_STYLES = {"daily_report", "digest", "meeting_digest", "executive"}
# Allowed values for the optional "length" field (keys of SUMMARY_LENGTH_LIMITS).
VALID_LENGTHS = {"tldr", "short", "standard", "detailed"}
|
||
|
||
|
||
def _clean_text(text: str) -> str:
|
||
return re.sub(r"\s+", " ", (text or "").strip())
|
||
|
||
|
||
def _normalize_key(text: str) -> str:
|
||
text = _clean_text(text).lower()
|
||
return re.sub(r"[^\w\u4e00-\u9fff]+", "", text)
|
||
|
||
|
||
def _split_sentences(text: str) -> list[str]:
    """Split *text* into cleaned sentences.

    Splits on CJK and ASCII sentence punctuation (。!?; . ! ? ;) and on
    newline runs, then drops empty parts. Each part is cleaned exactly once
    via a walrus binding — the original cleaned every part twice (once in the
    filter, once in the kept value).
    """
    raw_parts = re.split(r"[。!?;.!?;]+|\n+", text)
    return [cleaned for part in raw_parts if (cleaned := _clean_text(part))]
|
||
|
||
|
||
def _sentence_tokens(sentence: str) -> list[str]:
|
||
return re.findall(r"[A-Za-z0-9_-]+|[\u4e00-\u9fff]{2,}", sentence.lower())
|
||
|
||
|
||
def _top_sentences(texts: list[str], limit: int) -> list[str]:
    """Return up to *limit* unique sentences from *texts*, best-scoring first.

    A sentence's score is the summed corpus frequency of its tokens; ties
    favor the earlier sentence. Duplicates (same normalized key) are dropped.
    """
    all_sentences = [s for text in texts for s in _split_sentences(text)]
    if not all_sentences:
        return []

    freq: Counter = Counter()
    for sentence in all_sentences:
        freq.update(_sentence_tokens(sentence))

    def rank_key(pair: tuple[int, str]) -> tuple[int, int]:
        position, sentence = pair
        score = sum(freq[token] for token in _sentence_tokens(sentence))
        # Negated position so that, under reverse sort, earlier sentences
        # win score ties — same ordering as the original tuple sort.
        return (score, -position)

    ordered = sorted(enumerate(all_sentences), key=rank_key, reverse=True)

    picked: list[str] = []
    seen_keys: set[str] = set()
    for _, sentence in ordered:
        key = _normalize_key(sentence)
        if not key or key in seen_keys:
            continue
        seen_keys.add(key)
        picked.append(sentence)
        if len(picked) >= limit:
            break
    return picked
|
||
|
||
|
||
def _trim_fragment(text: str, max_length: int = 80) -> str:
|
||
fragment = re.split(r"[,,。;;!?]", text, maxsplit=1)[0]
|
||
fragment = _clean_text(fragment)
|
||
return fragment[:max_length].strip()
|
||
|
||
|
||
def _extract_actions(texts: list[str]) -> list[dict[str, Any]]:
    """Extract action items from *texts* using ACTION_PREFIX_PATTERNS.

    Returns at most 10 deduplicated items shaped
    ``{"task", "owner", "due_at", "blocker"}``; only "task" is populated.
    """
    found: list[dict[str, Any]] = []
    for text in texts:
        for sentence in _split_sentences(text):
            for pattern in ACTION_PREFIX_PATTERNS:
                hit = re.match(pattern, sentence, re.IGNORECASE)
                if hit is None:
                    continue
                task = _trim_fragment(hit.group(1))
                if len(task) < 2:
                    # Too short to be a real task — give the next pattern a chance.
                    continue
                found.append({"task": task, "owner": None, "due_at": None, "blocker": None})
                break  # at most one action item per sentence

    unique: list[dict[str, Any]] = []
    seen_keys: set[str] = set()
    for item in found:
        fingerprint = _normalize_key(item["task"])
        if fingerprint and fingerprint not in seen_keys:
            seen_keys.add(fingerprint)
            unique.append(item)
    return unique[:10]
|
||
|
||
|
||
def _extract_risks(texts: list[str]) -> list[dict[str, Any]]:
    """Scan *texts* for risk keywords and return up to 10 deduplicated risks.

    Impact levels are checked in RISK_KEYWORDS insertion order (high first)
    and at most one risk is recorded per sentence. Each item is shaped
    ``{"risk": <context fragment>, "impact": <level>, "mitigation": None}``.
    """
    risks: list[dict[str, Any]] = []
    for text in texts:
        for sentence in _split_sentences(text):
            lowered = sentence.lower()
            for impact, keywords in RISK_KEYWORDS.items():
                matched = next((keyword for keyword in keywords if keyword.lower() in lowered), None)
                if not matched:
                    continue
                # Hoisted: the original ran this substring search twice.
                hit_at = lowered.find(matched.lower())
                # Slice a context window around the keyword occurrence.
                # NOTE(review): indexes from `lowered` are applied to `sentence`,
                # which assumes lower() preserved length — true for these keywords.
                start = max(0, hit_at - 18)
                end = min(len(sentence), hit_at + len(matched) + 30)
                fragment = _clean_text(sentence[start:end])[:120]
                if len(fragment) < 2:
                    continue  # window collapsed to nothing; try the next level
                risks.append({"risk": fragment, "impact": impact, "mitigation": None})
                break  # one risk per sentence

    dedup: list[dict[str, Any]] = []
    seen: set[str] = set()
    for item in risks:
        key = _normalize_key(item["risk"])
        if key and key not in seen:
            seen.add(key)
            dedup.append(item)
    return dedup[:10]
|
||
|
||
|
||
def _build_summary_line(sentences: list[str]) -> str:
|
||
if not sentences:
|
||
return "暂无可提炼的关键信息。"
|
||
selected = sentences[:2]
|
||
return ";".join(selected)
|
||
|
||
|
||
def build_summary(payload: dict[str, Any]) -> dict[str, Any]:
    """Build a structured summary from ``payload["data"]``.

    Reads ``data["sources"]`` (each entry supplying a "content" string) and
    the optional fields "length" (key of SUMMARY_LENGTH_LIMITS), "style"
    (echoed back), "extract_actions" and "extract_risks" (truthy flags).

    Returns a dict with the headline summary, bulleted sections, action and
    risk items, a rendered markdown document, a fixed schedule payload, and
    the style.
    """
    data = payload.get("data", {})
    sources = data.get("sources", [])
    # Clean each source exactly once (the original cleaned every source twice:
    # once in the filter and once in the kept value).
    texts = [
        cleaned
        for source in sources
        if (cleaned := _clean_text(source.get("content", "")))
    ]
    length = data.get("length", "standard")
    style = data.get("style", "daily_report")
    # Unknown lengths silently fall back to the "standard" budget.
    limit = SUMMARY_LENGTH_LIMITS.get(length, SUMMARY_LENGTH_LIMITS["standard"])
    top_sentences = _top_sentences(texts, limit)

    summary_line = _build_summary_line(top_sentences)
    # Sentences already used in the headline are excluded from the sections.
    summary_keys = {_normalize_key(sentence) for sentence in top_sentences[:2]}
    detail_sentences = [
        sentence for sentence in top_sentences if _normalize_key(sentence) not in summary_keys
    ]

    sections = []
    if detail_sentences:
        if len(detail_sentences) == 1:
            sections = [{"title": "Key Updates", "bullets": detail_sentences}]
        else:
            # Split the remaining bullets roughly in half across two sections.
            midpoint = max(1, len(detail_sentences) // 2)
            sections = [
                {"title": "Key Updates", "bullets": detail_sentences[:midpoint]},
                {"title": "Notable Details", "bullets": detail_sentences[midpoint:]},
            ]

    action_items = _extract_actions(texts) if data.get("extract_actions") else []
    risk_items = _extract_risks(texts) if data.get("extract_risks") else []

    # Render the markdown document top to bottom.
    markdown_lines = ["# Summary", "", "## Summary", f"- {summary_line}"]
    for section in sections:
        if not section["bullets"]:
            continue
        markdown_lines.extend(["", f"## {section['title']}"])
        markdown_lines.extend(f"- {bullet}" for bullet in section["bullets"])
    if action_items:
        markdown_lines.extend(["", "## Action Items"])
        markdown_lines.extend(f"- {item['task']}" for item in action_items)
    if risk_items:
        markdown_lines.extend(["", "## Risks"])
        markdown_lines.extend(f"- [{item['impact']}] {item['risk']}" for item in risk_items)

    schedule_payload = {
        "suggested_name": "Daily Summary",
        "message": "[Scheduled Task Triggered] 请立即汇总最新内容并输出结构化摘要,如有行动项和风险请一并列出,然后选择合适的通知方式发送给用户。",
    }

    return {
        "summary": summary_line,
        "sections": sections,
        "action_items": action_items,
        "risk_items": risk_items,
        "markdown": "\n".join(markdown_lines),
        "schedule_payload": schedule_payload,
        "style": style,
    }
|
||
|
||
|
||
def _validate_source(source: Any, index: int) -> list[str]:
|
||
errors = []
|
||
if not isinstance(source, dict):
|
||
return [f"data.sources[{index}] must be an object"]
|
||
if not _clean_text(str(source.get("content", ""))):
|
||
errors.append(f"data.sources[{index}].content is required")
|
||
return errors
|
||
|
||
|
||
def validate_payload(payload: dict[str, Any]) -> list[str]:
    """Validate the request payload; return a list of error strings (empty = valid)."""
    data = payload.get("data")
    if not isinstance(data, dict):
        return ["data must be an object"]

    errors: list[str] = []

    sources = data.get("sources")
    if isinstance(sources, list) and sources:
        for index, source in enumerate(sources):
            errors.extend(_validate_source(source, index))
    else:
        errors.append("data.sources must be a non-empty array")

    objective = data.get("objective")
    if not (isinstance(objective, str) and objective.strip()):
        errors.append("data.objective is required")

    style = data.get("style")
    if style is not None and style not in VALID_STYLES:
        errors.append(f"data.style must be one of {sorted(VALID_STYLES)}")

    length = data.get("length")
    if length is not None and length not in VALID_LENGTHS:
        errors.append(f"data.length must be one of {sorted(VALID_LENGTHS)}")

    for field in BOOL_FIELDS:
        value = data.get(field)
        if value is not None and not isinstance(value, bool):
            errors.append(f"data.{field} must be a boolean")

    return errors
|