#!/usr/bin/env python3 """ 中国天气网天气预报查询工具 (weather.com.cn) 查询7天天气预报和生活指数数据,输出结构化数据供AI模型使用。 """ import re import sys import json import urllib.parse import urllib.request from html.parser import HTMLParser from datetime import datetime class WeatherHTMLParser(HTMLParser): """解析天气页面HTML,提取7天预报和生活指数数据。""" def __init__(self): super().__init__() self._capture = False self._target_id = None self._depth = 0 self._data_parts = [] self._result = "" def extract(self, html, target_id): """提取指定id的div内容。""" self._capture = False self._target_id = target_id self._depth = 0 self._data_parts = [] self._result = "" self.feed(html) return self._result def handle_starttag(self, tag, attrs): attrs_dict = dict(attrs) if tag == "div" and attrs_dict.get("id") == self._target_id: self._capture = True self._depth = 1 return if self._capture and tag == "div": self._depth += 1 if self._capture: attr_str = " ".join(f'{k}="{v}"' for k, v in attrs) self._data_parts.append(f"<{tag} {attr_str}>" if attr_str else f"<{tag}>") def handle_endtag(self, tag): if self._capture: self._data_parts.append(f"{tag}>") if tag == "div": self._depth -= 1 if self._depth <= 0: self._capture = False self._result = "".join(self._data_parts) def handle_data(self, data): if self._capture: self._data_parts.append(data) class ChinaWeather: """中国天气网天气查询。""" # 城市搜索API SEARCH_URL = "https://toy1.weather.com.cn/search?cityname={query}&callback=success_jsonpCallback&_={ts}" # 天气预报页面 WEATHER_URL = "https://www.weather.com.cn/weather/{code}.shtml" # 请求超时(秒) REQUEST_TIMEOUT = 15 # 请求头 REQUEST_HEADERS = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/145.0.0.0 Safari/537.36", "Accept": "*/*", "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8", "Referer": "https://www.weather.com.cn/", } # 预置常用城市代码 PRESET_CITIES = { "北京": "101010100", "上海": "101020100", "广州": "101280101", "深圳": "101280601", "成都": "101270101", "杭州": "101210101", "南京": "101190101", "武汉": "101200101", "西安": "101110101", "重庆": "101040100", "天津": "101030100", "沈阳": "101070101", "哈尔滨": "101050101", "长春": "101060101", "济南": "101120101", "青岛": "101120201", "郑州": "101180101", "长沙": "101250101", "南昌": "101240101", "福州": "101230101", "厦门": "101230201", "南宁": "101300101", "海口": "101310101", "三亚": "101310201", "贵阳": "101260101", "昆明": "101290101", "兰州": "101160101", "银川": "101170101", "西宁": "101150101", "拉萨": "101140101", "乌鲁木齐": "101130101", "石家庄": "101090101", "太原": "101100101", "呼和浩特": "101080101", "大连": "101070201", "苏州": "101190401", "无锡": "101190201", "宁波": "101210401", "温州": "101210701", "绍兴": "101210601", "金华": "101210901", "台州": "101211101", "嘉兴": "101210301", "湖州": "101210201", "衢州": "101211001", "丽水": "101211201", "舟山": "101211301", "东莞": "101281901", "佛山": "101281701", "珠海": "101280701", "中山": "101281801", "惠州": "101280301", "江门": "101280603", "肇庆": "101280901", "湛江": "101281001", "茂名": "101281101", "汕头": "101280401", "合肥": "101220101", "常州": "101191101", "徐州": "101190801", "烟台": "101120501", "潍坊": "101120601", "临沂": "101120901", "洛阳": "101180901", "襄阳": "101200201", "宜昌": "101200901", "芜湖": "101220301", "泉州": "101230501", "漳州": "101230601", "桂林": "101300501", "柳州": "101300301", "遵义": "101260201", "大理": "101290601", "丽江": "101291401", "西双版纳": "101291601", "延安": "101110600", "宝鸡": "101110901", "咸阳": "101110200", "香港": "101320101", "澳门": "101330101", } # 生活指数名称映射 LIFE_INDEX_NAMES = { "感冒": "cold", "运动": "exercise", "过敏": "allergy", "穿衣": "clothing", "洗车": "carwash", "紫外线": "uv", "钓鱼": "fishing", "旅游": "travel", "晾晒": "drying", "交通": "traffic", "防晒": "sunscreen", } def __init__(self): self._parser = WeatherHTMLParser() def _http_get(self, url, encoding="utf-8"): """发送HTTP GET请求。""" req = urllib.request.Request(url, headers=self.REQUEST_HEADERS) try: with urllib.request.urlopen(req, timeout=self.REQUEST_TIMEOUT) as resp: return resp.read().decode(encoding, errors="replace") except Exception as e: raise ConnectionError(f"HTTP请求失败: {url} - {e}") def search_city(self, city_name): """通过搜索API查询城市代码。 Returns: tuple: (城市代码, 城市显示名) 或 None """ encoded = urllib.parse.quote(city_name) ts = int(datetime.now().timestamp() * 1000) url = self.SEARCH_URL.format(query=encoded, ts=ts) try: text = self._http_get(url) except ConnectionError: return None # 解析JSONP: success_jsonpCallback([...]) match = re.search(r"success_jsonpCallback\((\[.*\])\)", text, re.DOTALL) if not match: return None try: results = json.loads(match.group(1)) except json.JSONDecodeError: return None if not results: return None # 取第一个结果,解析格式: "代码~省份~城市名~英文名~..." ref = results[0].get("ref", "") parts = ref.split("~") if len(parts) >= 3: code = parts[0] display_name = parts[2] # 确保是有效的城市代码(以101开头的9位数字) if re.match(r"^101\d{6}$", code): return (code, display_name) return None def get_city_code(self, city_name): """获取城市代码,优先使用预置数据。 Returns: tuple: (城市代码, 城市显示名) """ city_name = city_name.strip() # 优先查预置城市 if city_name in self.PRESET_CITIES: return (self.PRESET_CITIES[city_name], city_name) # 实时搜索 result = self.search_city(city_name) if result: return result raise ValueError(f"未找到城市: {city_name}") def fetch_weather_html(self, city_code): """获取天气预报页面HTML。""" url = self.WEATHER_URL.format(code=city_code) return self._http_get(url) def parse_7day_forecast(self, html): """解析7天天气预报数据。 从
天气
高温/低温℃
... 风力等级
""" block = self._parser.extract(html, "7d") if not block: return [] forecasts = [] # 按]*class="wea"', li) if m: day["weather"] = m.group(1).strip() else: m = re.search(r'class="wea"[^>]*>([^<]+)<', li) if m: day["weather"] = m.group(1).strip() # 温度 -
高温℃/低温℃
# 注意:晚间"今天"可能无高温,仅有低温℃ m = re.search(r'(\d+)℃?', li) if m: day["temp_high"] = m.group(1) + "℃" m = re.search(r'(\d+)℃', li) if m: day["temp_low"] = m.group(1) + "℃" # 风向 - 内第一个 m = re.search(r']*class="[A-Z]', li) if m: day["wind"] = m.group(1).strip() # 风力等级 -内紧跟
后的 等级 m = re.search(r'\s*([^<]+)', li) if m: day["wind_level"] = m.group(1).strip() forecasts.append(day) return forecasts def parse_life_indices(self, html): """解析生活指数数据。 从描述
描述内容
m_desc = re.search(r'([^<]+)
', li) if not (m_level and m_name): continue level = m_level.group(1).strip() name = m_name.group(1).strip() desc = m_desc.group(1).strip() if m_desc else "" # 提取指数类型关键词(去掉"指数"后缀) index_type = name.replace("指数", "").strip() key = self.LIFE_INDEX_NAMES.get(index_type, index_type) indices.append({ "name": name, "key": key, "level": level, "description": desc, }) return indices def query(self, city_name): """查询指定城市的天气预报。 Args: city_name: 城市名称,如"南京"、"北京" Returns: dict: 包含城市信息、7天预报(含每日生活指数)的结构化数据 """ code, display_name = self.get_city_code(city_name) html = self.fetch_weather_html(code) forecast = self.parse_7day_forecast(html) life_indices = self.parse_life_indices(html) # 将生活指数按天分组后合并到对应天的forecast中 # livezs中的指数是连续排列的,每天的指数类型数量相同 if forecast and life_indices: num_days = len(forecast) indices_per_day = len(life_indices) // num_days if num_days > 0 else 0 if indices_per_day > 0: for i, day in enumerate(forecast): start = i * indices_per_day end = start + indices_per_day day["life_indices"] = life_indices[start:end] else: # 无法均分时,全部放到第一天 forecast[0]["life_indices"] = life_indices return { "city": display_name, "city_code": code, "query_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), "source": "weather.com.cn", "forecast": forecast, } def format_output(self, data): """格式化天气数据为文本输出(供AI模型读取)。""" if "error" in data: return f"错误: {data['error']}" lines = [ f"城市: {data['city']} (代码: {data['city_code']})", f"数据来源: {data['source']}", f"查询时间: {data['query_time']}", ] for day in data.get("forecast", []): lines.append("") temp = f"{day['temp_high']}/{day['temp_low']}" if day.get("temp_high") else day.get("temp_low", "") wind_info = f"{day['wind']} {day['wind_level']}" if day.get("wind") else "" lines.append(f"[{day['date']}] {day['weather']}, {temp}, {wind_info}".rstrip(", ")) # 每天的生活指数 for idx in day.get("life_indices", []): lines.append(f" {idx['name']}: {idx['level']} - {idx['description']}") return "\n".join(lines) def main(): if len(sys.argv) < 2: print("Usage: weather_cn.py