refactor: optimize structure, stability and runtime performance

This commit is contained in:
2026-02-07 00:35:11 +08:00
parent fae21329d7
commit bf29ac1924
44 changed files with 6894 additions and 4792 deletions

View File

@@ -0,0 +1,60 @@
# 健康监控(邮件版)
本目录提供 `health_email_monitor.py`,通过调用 `/health` 接口并使用**容器内已有邮件配置**发告警邮件。
## 1) 快速试跑
```bash
cd /root/zsglpt
python3 scripts/health_email_monitor.py \
--to 你的告警邮箱@example.com \
--container knowledge-automation-multiuser \
--url http://127.0.0.1:51232/health \
--dry-run
```
去掉 `--dry-run` 即会实际发邮件。
## 2) 建议 cron(每分钟)
```bash
* * * * * cd /root/zsglpt && /usr/bin/python3 scripts/health_email_monitor.py \
--to 你的告警邮箱@example.com \
--container knowledge-automation-multiuser \
--url http://127.0.0.1:51232/health \
>> /root/zsglpt/logs/health_monitor.log 2>&1
```
## 3) 支持的规则
- `service_down`:健康接口请求失败(立即告警)
- `health_fail`:返回 `ok/db_ok` 异常或 HTTP 5xx(立即告警)
- `db_pool_exhausted`:连接池耗尽(默认连续 3 次才告警)
- `queue_backlog_high`:任务堆积过高(默认 `pending_total >= 50` 且连续 5 次)
脚本支持恢复通知(规则恢复正常会发“恢复”邮件)。
## 4) 常用参数
- `--to`:收件人(必填)
- `--container`:Docker 容器名(默认 `knowledge-automation-multiuser`)
- `--url`:健康地址(默认 `http://127.0.0.1:51232/health`)
- `--state-file`:状态文件路径(默认 `/tmp/zsglpt_health_monitor_state.json`)
- `--remind-seconds`:重复告警间隔(默认 3600 秒)
- `--queue-threshold`:队列告警阈值(默认 50)
- `--queue-streak`:队列连续次数阈值(默认 5)
- `--db-pool-streak`:连接池连续次数阈值(默认 3)
## 5) 环境变量方式(可选)
也可不用命令行参数,改用环境变量:
- `MONITOR_EMAIL_TO`
- `MONITOR_DOCKER_CONTAINER`
- `HEALTH_URL`
- `MONITOR_STATE_FILE`
- `MONITOR_REMIND_SECONDS`
- `MONITOR_QUEUE_THRESHOLD`
- `MONITOR_QUEUE_STREAK`
- `MONITOR_DB_POOL_STREAK`

View File

@@ -0,0 +1,348 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from __future__ import annotations
import argparse
import json
import os
import subprocess
import time
from datetime import datetime
from typing import Any, Dict, Tuple
from urllib.error import HTTPError, URLError
from urllib.request import Request, urlopen
# Default path of the JSON state file (per-rule alert state and streak counters);
# used when neither --state-file nor MONITOR_STATE_FILE is given.
DEFAULT_STATE_FILE = "/tmp/zsglpt_health_monitor_state.json"
def _now_text() -> str:
return datetime.now().strftime("%Y-%m-%d %H:%M:%S")
def _safe_int(value: Any, default: int = 0) -> int:
try:
return int(value)
except Exception:
return int(default)
def _safe_float(value: Any, default: float = 0.0) -> float:
try:
return float(value)
except Exception:
return float(default)
def _load_state(path: str) -> Dict[str, Any]:
if not path or not os.path.exists(path):
return {
"version": 1,
"rules": {},
"counters": {},
}
try:
with open(path, "r", encoding="utf-8") as f:
raw = json.load(f)
if not isinstance(raw, dict):
raise ValueError("state is not dict")
raw.setdefault("version", 1)
raw.setdefault("rules", {})
raw.setdefault("counters", {})
return raw
except Exception:
return {
"version": 1,
"rules": {},
"counters": {},
}
def _save_state(path: str, state: Dict[str, Any]) -> None:
if not path:
return
state_dir = os.path.dirname(path)
if state_dir:
os.makedirs(state_dir, exist_ok=True)
tmp_path = f"{path}.tmp"
with open(tmp_path, "w", encoding="utf-8") as f:
json.dump(state, f, ensure_ascii=False, indent=2)
os.replace(tmp_path, path)
def _fetch_health(url: str, timeout: int) -> Tuple[int | None, Dict[str, Any], str | None]:
req = Request(
url,
headers={
"User-Agent": "zsglpt-health-email-monitor/1.0",
"Accept": "application/json",
},
method="GET",
)
try:
with urlopen(req, timeout=max(1, int(timeout))) as resp:
status = int(resp.getcode())
body = resp.read().decode("utf-8", errors="ignore")
except HTTPError as e:
status = int(getattr(e, "code", 0) or 0)
body = ""
try:
body = e.read().decode("utf-8", errors="ignore")
except Exception:
pass
data = {}
if body:
try:
data = json.loads(body)
if not isinstance(data, dict):
data = {}
except Exception:
data = {}
return status, data, f"HTTPError: {e}"
except URLError as e:
return None, {}, f"URLError: {e}"
except Exception as e:
return None, {}, f"RequestError: {e}"
data: Dict[str, Any] = {}
if body:
try:
loaded = json.loads(body)
if isinstance(loaded, dict):
data = loaded
except Exception:
data = {}
return status, data, None
def _inc_streak(state: Dict[str, Any], key: str, bad: bool) -> int:
    """Update and return the consecutive-failure counter stored under *key*.

    The counter lives in ``state["counters"]`` (created if absent). A bad
    check increments the stored value; a good check resets it to 0.
    """
    counters = state.setdefault("counters", {})
    streak = (_safe_int(counters.get(key), 0) + 1) if bad else 0
    counters[key] = streak
    return streak
def _rule_transition(
    state: Dict[str, Any],
    *,
    rule_name: str,
    bad: bool,
    streak: int,
    threshold: int,
    remind_seconds: int,
    now_ts: float,
) -> str | None:
    """Advance the alert state of one rule and say which mail, if any, is due.

    The per-rule record in ``state["rules"][rule_name]`` holds ``active``
    and ``last_sent`` and is updated in place.

    Args:
        state: Monitor state dict (mutated).
        rule_name: Key of the rule inside ``state["rules"]``.
        bad: Whether the rule's condition failed on this check.
        streak: Current consecutive-failure count for the rule.
        threshold: Streak needed before alerting (clamped to at least 1).
        remind_seconds: Minimum gap between repeated alerts (at least 60).
        now_ts: Current timestamp, stored as ``last_sent`` when a mail is due.

    Returns:
        ``"alert"`` for a first alert or a due reminder, ``"recover"`` when an
        active rule passes again, otherwise ``None``.
    """
    rule_state = state.setdefault("rules", {}).setdefault(
        rule_name, {"active": False, "last_sent": 0}
    )
    was_active = bool(rule_state.get("active", False))
    previous_sent = _safe_float(rule_state.get("last_sent", 0), 0.0)
    min_streak = max(1, int(threshold))
    min_gap = max(60, int(remind_seconds))

    if bad and streak >= min_streak:
        # Already alerting: stay quiet until the reminder interval has elapsed.
        due = (not was_active) or (now_ts - previous_sent) >= min_gap
        if not due:
            return None
        if not was_active:
            rule_state["active"] = True
        rule_state["last_sent"] = now_ts
        return "alert"

    # Only an active rule that passes again produces a recovery notice.
    if bad or not was_active:
        return None
    rule_state["active"] = False
    rule_state["last_sent"] = now_ts
    return "recover"
def _send_email_via_container(
*,
container_name: str,
to_email: str,
subject: str,
body: str,
timeout_seconds: int = 45,
) -> Tuple[bool, str]:
code = (
"import sys,email_service;"
"res=email_service.send_email(to_email=sys.argv[1],subject=sys.argv[2],body=sys.argv[3],email_type='health_monitor');"
"ok=bool(res.get('success'));"
"print('ok' if ok else ('error:'+str(res.get('error'))));"
"raise SystemExit(0 if ok else 2)"
)
cmd = [
"docker",
"exec",
container_name,
"python",
"-c",
code,
to_email,
subject,
body,
]
try:
proc = subprocess.run(
cmd,
capture_output=True,
text=True,
timeout=max(5, int(timeout_seconds)),
check=False,
)
except Exception as e:
return False, str(e)
output = (proc.stdout or "") + (proc.stderr or "")
output = output.strip()
return proc.returncode == 0, output
def _build_common_lines(status: int | None, data: Dict[str, Any], fetch_error: str | None) -> list[str]:
    """Build the body lines shared by every notification mail.

    Args:
        status: HTTP status of the health request, or ``None`` if it failed.
        data: Parsed health payload; ``_monitor_url`` and the ``metrics``
            sub-sections are read when present.
        fetch_error: Request error text, included only when non-empty.

    Returns:
        Text lines covering time, URL, HTTP status, ``ok``/``db_ok``, the
        optional request error, then queue, DB pool, process and uptime
        metrics (``N/A`` for missing values).
    """

    def _section(parent: Dict[str, Any], key: str) -> Dict[str, Any]:
        # Treat a missing or non-dict section as empty so lookups fall back to N/A.
        child = parent.get(key)
        return child if isinstance(child, dict) else {}

    metrics = _section(data, "metrics")
    db_pool = _section(metrics, "db_pool")
    process = _section(metrics, "process")
    task_queue = _section(metrics, "task_queue")

    status_text = status if status is not None else "请求失败"
    lines = [
        f"时间: {_now_text()}",
        f"健康地址: {data.get('_monitor_url', '')}",
        f"HTTP状态: {status_text}",
        f"ok/db_ok: {data.get('ok')} / {data.get('db_ok')}",
    ]
    if fetch_error:
        lines.append(f"请求错误: {fetch_error}")

    lines.append(
        f"队列: pending={task_queue.get('pending_total', 'N/A')}, running={task_queue.get('running_total', 'N/A')}"
    )
    lines.append(
        f"连接池: size={db_pool.get('pool_size', 'N/A')}, available={db_pool.get('available', 'N/A')}, in_use={db_pool.get('in_use', 'N/A')}"
    )
    lines.append(
        f"进程: rss_mb={process.get('rss_mb', 'N/A')}, cpu%={process.get('cpu_percent', 'N/A')}, threads={process.get('threads', 'N/A')}"
    )
    lines.append(f"运行时长: {metrics.get('uptime_seconds', 'N/A')}")
    return lines
def _build_arg_parser() -> argparse.ArgumentParser:
    """Build the CLI parser; most options take their default from an environment variable."""
    parser = argparse.ArgumentParser(description="zsglpt 邮件健康监控(基于 /health)")
    parser.add_argument("--url", default=os.environ.get("HEALTH_URL", "http://127.0.0.1:51232/health"))
    parser.add_argument("--to", default=os.environ.get("MONITOR_EMAIL_TO", ""))
    parser.add_argument(
        "--container",
        default=os.environ.get("MONITOR_DOCKER_CONTAINER", "knowledge-automation-multiuser"),
    )
    parser.add_argument("--state-file", default=os.environ.get("MONITOR_STATE_FILE", DEFAULT_STATE_FILE))
    parser.add_argument("--timeout", type=int, default=_safe_int(os.environ.get("MONITOR_TIMEOUT", 8), 8))
    parser.add_argument(
        "--remind-seconds",
        type=int,
        default=_safe_int(os.environ.get("MONITOR_REMIND_SECONDS", 3600), 3600),
    )
    parser.add_argument(
        "--queue-threshold",
        type=int,
        default=_safe_int(os.environ.get("MONITOR_QUEUE_THRESHOLD", 50), 50),
    )
    parser.add_argument(
        "--queue-streak",
        type=int,
        default=_safe_int(os.environ.get("MONITOR_QUEUE_STREAK", 5), 5),
    )
    parser.add_argument(
        "--db-pool-streak",
        type=int,
        default=_safe_int(os.environ.get("MONITOR_DB_POOL_STREAK", 3), 3),
    )
    parser.add_argument("--dry-run", action="store_true", help="仅打印,不实际发邮件")
    return parser


def _evaluate_rules(
    status: int | None,
    data: Dict[str, Any],
    args: argparse.Namespace,
) -> list[tuple[str, bool, int]]:
    """Evaluate every rule for this check as ``(rule_name, bad, streak_threshold)``."""
    metrics = data.get("metrics") if isinstance(data.get("metrics"), dict) else {}
    db_pool = metrics.get("db_pool") if isinstance(metrics.get("db_pool"), dict) else {}
    task_queue = metrics.get("task_queue") if isinstance(metrics.get("task_queue"), dict) else {}

    # No HTTP status at all means the request itself failed.
    service_down = status is None
    # A response arrived but reports a 5xx status or a falsy/missing ok / db_ok.
    health_fail = bool(
        status is not None
        and (status >= 500 or (not data.get("ok", False)) or (not data.get("db_ok", False)))
    )
    pool_size = _safe_int(db_pool.get("pool_size"), 0)
    db_pool_exhausted = (
        pool_size > 0
        and _safe_int(db_pool.get("available"), 0) <= 0
        and _safe_int(db_pool.get("in_use"), 0) >= pool_size
    )
    queue_backlog_high = _safe_int(task_queue.get("pending_total"), 0) >= max(1, int(args.queue_threshold))

    return [
        ("service_down", service_down, 1),
        ("health_fail", health_fail, 1),
        ("db_pool_exhausted", db_pool_exhausted, max(1, int(args.db_pool_streak))),
        ("queue_backlog_high", queue_backlog_high, max(1, int(args.queue_streak))),
    ]


def main() -> int:
    """Run one health check and send alert/recovery mails as needed.

    Flow: parse options, load state, fetch the health endpoint, evaluate the
    rules, update streaks and per-rule alert state, then send one mail per
    rule that needs a notification.

    With ``--dry-run`` nothing is sent and the state file is left untouched,
    so a preview cannot suppress the alert of a later real run. When a mail
    cannot be sent, that rule's alert state is restored to what it was before
    this run so the notification is attempted again on the next run.

    Returns:
        0 on success (including "nothing to send"), 1 if at least one mail
        could not be sent, 2 if no recipient is configured.
    """
    args = _build_arg_parser().parse_args()
    if not args.to:
        print("[monitor] 缺少收件人,请设置 --to 或 MONITOR_EMAIL_TO", flush=True)
        return 2

    state = _load_state(args.state_file)
    now_ts = time.time()
    status, data, fetch_error = _fetch_health(args.url, args.timeout)
    if not isinstance(data, dict):
        data = {}
    data["_monitor_url"] = args.url

    # Snapshot per-rule alert state so a failed delivery can be rolled back.
    stored_rules = state.get("rules")
    previous_rules: Dict[str, Any] = (
        {name: dict(entry) for name, entry in stored_rules.items() if isinstance(entry, dict)}
        if isinstance(stored_rules, dict)
        else {}
    )

    pending_notifications: list[tuple[str, str]] = []
    for rule_name, bad, threshold in _evaluate_rules(status, data, args):
        streak = _inc_streak(state, rule_name, bad)
        action = _rule_transition(
            state,
            rule_name=rule_name,
            bad=bad,
            streak=streak,
            threshold=threshold,
            remind_seconds=args.remind_seconds,
            now_ts=now_ts,
        )
        if action:
            pending_notifications.append((rule_name, action))

    # Persist before sending so a crash mid-send cannot cause duplicate mails.
    # A dry run must not record alerts as sent.
    if not args.dry_run:
        _save_state(args.state_file, state)

    if not pending_notifications:
        print(f"[monitor] {_now_text()} 正常,无需发送邮件")
        return 0

    common_lines = _build_common_lines(status, data, fetch_error)
    send_failed = False
    for rule_name, action in pending_notifications:
        level = "告警" if action == "alert" else "恢复"
        subject = f"[zsglpt健康监控][{level}] {rule_name}"
        body = "\n".join(
            [
                f"规则: {rule_name}",
                f"状态: {level}",
                "",
                *common_lines,
            ]
        )
        if args.dry_run:
            print(f"[monitor][dry-run] subject={subject}\n{body}\n")
            continue
        ok, msg = _send_email_via_container(
            container_name=args.container,
            to_email=args.to,
            subject=subject,
            body=body,
        )
        if ok:
            print(f"[monitor] 邮件已发送: {subject}")
            continue
        print(f"[monitor] 邮件发送失败: {subject} | {msg}")
        send_failed = True
        # Undo this rule's transition so the same notification is due again
        # on the next run instead of waiting for the reminder interval.
        rules = state.setdefault("rules", {})
        if rule_name in previous_rules:
            rules[rule_name] = previous_rules[rule_name]
        else:
            rules.pop(rule_name, None)

    if send_failed:
        _save_state(args.state_file, state)
        return 1
    return 0
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    exit_status = main()
    raise SystemExit(exit_status)