refactor: optimize structure, stability and runtime performance

2026-02-07 00:35:11 +08:00
parent fae21329d7
commit bf29ac1924
44 changed files with 6894 additions and 4792 deletions
--- a/scripts/HEALTH_MONITOR_README.md
+++ b/scripts/HEALTH_MONITOR_README.md
@@ -0,0 +1,60 @@
+# 健康监控（邮件版）
+
+本目录提供 `health_email_monitor.py`，通过调用 `/health` 接口并使用**容器内已有邮件配置**发告警邮件。
+
+## 1) 快速试跑
+
+```bash
+cd /root/zsglpt
+python3 scripts/health_email_monitor.py \
+  --to 你的告警邮箱@example.com \
+  --container knowledge-automation-multiuser \
+  --url http://127.0.0.1:51232/health \
+  --dry-run
+```
+
+去掉 `--dry-run` 即会实际发邮件。
+
+## 2) 建议 cron（每分钟）
+
+```bash
+* * * * * cd /root/zsglpt && /usr/bin/python3 scripts/health_email_monitor.py --to 你的告警邮箱@example.com --container knowledge-automation-multiuser --url http://127.0.0.1:51232/health >> /root/zsglpt/logs/health_monitor.log 2>&1
+```
+
+> 注意：crontab 不支持用 `\` 续行，整条任务必须写在同一行。
+
+## 3) 支持的规则
+
+- `service_down`：健康接口请求失败（立即告警）
+- `health_fail`：返回 `ok/db_ok` 异常或 HTTP 5xx（立即告警）
+- `db_pool_exhausted`：连接池耗尽（默认连续 3 次才告警）
+- `queue_backlog_high`：任务堆积过高（默认 `pending_total >= 50` 且连续 5 次）
+
+脚本支持恢复通知（规则恢复正常会发“恢复”邮件）。
+
+## 4) 常用参数
+
+- `--to`：收件人（必填）
+- `--container`：Docker 容器名（默认 `knowledge-automation-multiuser`）
+- `--url`：健康地址（默认 `http://127.0.0.1:51232/health`）
+- `--state-file`：状态文件路径（默认 `/tmp/zsglpt_health_monitor_state.json`）
+- `--remind-seconds`：重复告警间隔（默认 3600 秒，最小 60 秒）
+- `--queue-threshold`：队列告警阈值（默认 50）
+- `--queue-streak`：队列连续次数阈值（默认 5）
+- `--db-pool-streak`：连接池连续次数阈值（默认 3）
+- `--timeout`：健康接口请求超时秒数（默认 8，最小 1）
+- `--dry-run`：仅打印告警内容，不实际发邮件
+
+## 5) 环境变量方式（可选）
+
+也可不用命令行参数，改用环境变量：
+
+- `MONITOR_EMAIL_TO`
+- `MONITOR_DOCKER_CONTAINER`
+- `HEALTH_URL`
+- `MONITOR_STATE_FILE`
+- `MONITOR_REMIND_SECONDS`
+- `MONITOR_QUEUE_THRESHOLD`
+- `MONITOR_QUEUE_STREAK`
+- `MONITOR_DB_POOL_STREAK`
+- `MONITOR_TIMEOUT`
+
--- a/scripts/health_email_monitor.py
+++ b/scripts/health_email_monitor.py
@@ -0,0 +1,348 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import subprocess
+import time
+from datetime import datetime
+from typing import Any, Dict, Tuple
+from urllib.error import HTTPError, URLError
+from urllib.request import Request, urlopen
+
+DEFAULT_STATE_FILE = "/tmp/zsglpt_health_monitor_state.json"
+
+
+def _now_text() -> str:
+    """Return the current local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
+    stamp = datetime.now()
+    return f"{stamp:%Y-%m-%d %H:%M:%S}"
+
+
+def _safe_int(value: Any, default: int = 0) -> int:
+    """Coerce ``value`` to ``int``, falling back to ``int(default)`` if conversion fails."""
+    try:
+        converted = int(value)
+    except Exception:
+        # Any conversion failure (wrong type, bad literal, ...) yields the default.
+        converted = int(default)
+    return converted
+
+
+def _safe_float(value: Any, default: float = 0.0) -> float:
+    """Coerce ``value`` to ``float``, falling back to ``float(default)`` if conversion fails."""
+    try:
+        converted = float(value)
+    except Exception:
+        # Any conversion failure (wrong type, bad literal, ...) yields the default.
+        converted = float(default)
+    return converted
+
+
+def _empty_state() -> Dict[str, Any]:
+    """Return a fresh monitor state with no rules and no streak counters."""
+    return {"version": 1, "rules": {}, "counters": {}}
+
+
+def _load_state(path: str) -> Dict[str, Any]:
+    """Load the persisted monitor state from ``path``.
+
+    Args:
+        path: Location of the JSON state file. An empty path means "no state".
+
+    Returns:
+        The decoded state dict, guaranteed to contain ``version`` plus dict-typed
+        ``rules`` and ``counters`` sections. A fresh empty state is returned when
+        the file is missing, unreadable, not valid JSON, or not a JSON object.
+    """
+    if not path:
+        return _empty_state()
+    try:
+        with open(path, "r", encoding="utf-8") as f:
+            raw = json.load(f)
+    except (OSError, ValueError, RecursionError):
+        # Missing/unreadable file or corrupt JSON: start over rather than crash.
+        return _empty_state()
+    if not isinstance(raw, dict):
+        return _empty_state()
+
+    raw.setdefault("version", 1)
+    # ``setdefault`` alone would keep a wrongly typed section (e.g. ``null`` or a
+    # list), which later breaks the ``.get``/``.setdefault`` calls on it.
+    for section in ("rules", "counters"):
+        if not isinstance(raw.get(section), dict):
+            raw[section] = {}
+    return raw
+
+
+def _save_state(path: str, state: Dict[str, Any]) -> None:
+    """Atomically persist ``state`` as JSON at ``path``.
+
+    The state is written to a temporary sibling file and then moved over the
+    target with ``os.replace``, so readers never observe a half-written file.
+
+    Args:
+        path: Destination file. An empty path makes this a no-op.
+        state: JSON-serialisable monitor state.
+
+    Raises:
+        OSError: If the directory or file cannot be written.
+        TypeError, ValueError: If ``state`` is not JSON-serialisable.
+    """
+    if not path:
+        return
+    state_dir = os.path.dirname(path)
+    if state_dir:
+        os.makedirs(state_dir, exist_ok=True)
+    # Per-process temp name: overlapping runs (e.g. a slow cron invocation) must
+    # not write into each other's partial file.
+    tmp_path = f"{path}.{os.getpid()}.tmp"
+    try:
+        with open(tmp_path, "w", encoding="utf-8") as f:
+            json.dump(state, f, ensure_ascii=False, indent=2)
+        os.replace(tmp_path, path)
+    finally:
+        # After a successful replace the temp file is gone; after a failure this
+        # removes the partial file. Cleanup is best-effort and must not mask the
+        # original error.
+        try:
+            os.remove(tmp_path)
+        except OSError:
+            pass
+
+
+def _parse_json_object(body: str) -> Dict[str, Any]:
+    """Decode ``body`` as JSON and return it only if it is an object, else ``{}``."""
+    if not body:
+        return {}
+    try:
+        loaded = json.loads(body)
+    except (ValueError, RecursionError):
+        return {}
+    return loaded if isinstance(loaded, dict) else {}
+
+
+def _fetch_health(url: str, timeout: int) -> Tuple[int | None, Dict[str, Any], str | None]:
+    """GET the health endpoint and return ``(status, data, error)``.
+
+    Args:
+        url: Health endpoint URL.
+        timeout: Request timeout in seconds (values below 1 are raised to 1).
+
+    Returns:
+        A tuple of:
+        * the HTTP status code, or ``None`` when no HTTP response was obtained;
+        * the response body decoded as a JSON object (``{}`` if absent/invalid);
+        * ``None`` on success, otherwise a short error description prefixed with
+          ``HTTPError:``, ``URLError:`` or ``RequestError:``.
+
+    This function never raises for request problems; a malformed URL or bad
+    timeout value is reported as a ``RequestError`` like any other failure.
+    """
+    try:
+        # Building the Request can itself raise (e.g. ValueError for a URL with
+        # no scheme), so it lives inside the try like the actual call.
+        req = Request(
+            url,
+            headers={
+                "User-Agent": "zsglpt-health-email-monitor/1.0",
+                "Accept": "application/json",
+            },
+            method="GET",
+        )
+        with urlopen(req, timeout=max(1, int(timeout))) as resp:
+            status = int(resp.getcode())
+            body = resp.read().decode("utf-8", errors="ignore")
+    except HTTPError as e:
+        # The server answered with an error status; its body may still carry
+        # the health JSON, so try to read it.
+        status = int(getattr(e, "code", 0) or 0)
+        try:
+            body = e.read().decode("utf-8", errors="ignore")
+        except Exception:
+            # Best effort only: an unreadable error body is treated as empty.
+            body = ""
+        return status, _parse_json_object(body), f"HTTPError: {e}"
+    except URLError as e:
+        return None, {}, f"URLError: {e}"
+    except Exception as e:
+        return None, {}, f"RequestError: {e}"
+
+    return status, _parse_json_object(body), None
+
+
+def _inc_streak(state: Dict[str, Any], key: str, bad: bool) -> int:
+    """Update and return the consecutive-failure counter stored under ``key``.
+
+    The counter lives in ``state["counters"]``; it grows by one when ``bad`` is
+    true and is reset to zero otherwise.
+    """
+    counters = state.setdefault("counters", {})
+    previous = _safe_int(counters.get(key), 0)
+    if bad:
+        updated = previous + 1
+    else:
+        updated = 0
+    counters[key] = updated
+    return updated
+
+
+def _rule_transition(
+    state: Dict[str, Any],
+    *,
+    rule_name: str,
+    bad: bool,
+    streak: int,
+    threshold: int,
+    remind_seconds: int,
+    now_ts: float,
+) -> str | None:
+    """Advance one rule's alert state and say which notification is due.
+
+    Args:
+        state: Monitor state; ``state["rules"][rule_name]`` is updated in place.
+        rule_name: Identifier of the rule being evaluated.
+        bad: Whether the rule's condition is currently violated.
+        streak: Number of consecutive violating observations.
+        threshold: Streak length required before alerting (clamped to >= 1).
+        remind_seconds: Interval between repeated alerts (clamped to >= 60).
+        now_ts: Current time as a Unix timestamp.
+
+    Returns:
+        ``"alert"`` for a new or repeated alert, ``"recover"`` when an active
+        rule is no longer violated, or ``None`` when nothing should be sent.
+    """
+    # The state comes from a file on disk; replace wrongly typed containers or
+    # entries instead of failing on ``.get`` with an AttributeError.
+    rules = state.get("rules")
+    if not isinstance(rules, dict):
+        rules = state["rules"] = {}
+    rule_state = rules.get(rule_name)
+    if not isinstance(rule_state, dict):
+        rule_state = rules[rule_name] = {"active": False, "last_sent": 0}
+
+    is_active = bool(rule_state.get("active", False))
+    last_sent = _safe_float(rule_state.get("last_sent", 0), 0.0)
+    threshold = max(1, int(threshold))
+    remind_seconds = max(60, int(remind_seconds))
+
+    if bad and streak >= threshold:
+        if not is_active:
+            rule_state["active"] = True
+            rule_state["last_sent"] = now_ts
+            return "alert"
+        if (now_ts - last_sent) >= remind_seconds:
+            # Still failing: repeat the alert once the reminder interval elapsed.
+            rule_state["last_sent"] = now_ts
+            return "alert"
+        return None
+
+    if is_active and (not bad):
+        rule_state["active"] = False
+        rule_state["last_sent"] = now_ts
+        return "recover"
+
+    return None
+
+
+def _send_email_via_container(
+    *,
+    container_name: str,
+    to_email: str,
+    subject: str,
+    body: str,
+    timeout_seconds: int = 45,
+) -> Tuple[bool, str]:
+    """Send a mail by running ``email_service.send_email`` inside a Docker container.
+
+    A small Python snippet is executed via ``docker exec <container> python -c``;
+    recipient, subject and body are handed over as positional arguments.
+
+    Args:
+        container_name: Name of the container to execute in.
+        to_email: Recipient address.
+        subject: Mail subject.
+        body: Mail body text.
+        timeout_seconds: Limit for the ``docker exec`` call (at least 5 seconds).
+
+    Returns:
+        ``(ok, output)`` where ``ok`` is true when the command exited with 0 and
+        ``output`` is the stripped stdout+stderr, or the exception text when the
+        command could not be run at all.
+    """
+    snippet = ";".join(
+        [
+            "import sys,email_service",
+            "res=email_service.send_email(to_email=sys.argv[1],subject=sys.argv[2],body=sys.argv[3],email_type='health_monitor')",
+            "ok=bool(res.get('success'))",
+            "print('ok' if ok else ('error:'+str(res.get('error'))))",
+            "raise SystemExit(0 if ok else 2)",
+        ]
+    )
+    argv = ["docker", "exec", container_name, "python", "-c", snippet, to_email, subject, body]
+    try:
+        completed = subprocess.run(
+            argv,
+            capture_output=True,
+            text=True,
+            timeout=max(5, int(timeout_seconds)),
+            check=False,
+        )
+    except Exception as exc:
+        return False, str(exc)
+
+    combined = ((completed.stdout or "") + (completed.stderr or "")).strip()
+    return completed.returncode == 0, combined
+
+
+def _build_common_lines(status: int | None, data: Dict[str, Any], fetch_error: str | None) -> list[str]:
+    """Build the status summary lines shared by every notification mail.
+
+    Args:
+        status: HTTP status of the health request, or ``None`` if it failed.
+        data: Decoded health payload (plus the ``_monitor_url`` key, if set).
+        fetch_error: Error text of the health request, if any.
+
+    Returns:
+        The report lines; metric values missing from ``data`` show as ``N/A``.
+    """
+
+    def _section(source: Dict[str, Any], key: str) -> Dict[str, Any]:
+        # Only accept dict-valued sections; anything else counts as missing.
+        candidate = source.get(key)
+        return candidate if isinstance(candidate, dict) else {}
+
+    metrics = _section(data, "metrics")
+    db_pool = _section(metrics, "db_pool")
+    process = _section(metrics, "process")
+    task_queue = _section(metrics, "task_queue")
+
+    http_text = "请求失败" if status is None else status
+    report = [
+        f"时间: {_now_text()}",
+        f"健康地址: {data.get('_monitor_url', '')}",
+        f"HTTP状态: {http_text}",
+        f"ok/db_ok: {data.get('ok')} / {data.get('db_ok')}",
+    ]
+    if fetch_error:
+        report.append(f"请求错误: {fetch_error}")
+
+    report.append(
+        f"队列: pending={task_queue.get('pending_total', 'N/A')}, running={task_queue.get('running_total', 'N/A')}"
+    )
+    report.append(
+        f"连接池: size={db_pool.get('pool_size', 'N/A')}, available={db_pool.get('available', 'N/A')}, in_use={db_pool.get('in_use', 'N/A')}"
+    )
+    report.append(
+        f"进程: rss_mb={process.get('rss_mb', 'N/A')}, cpu%={process.get('cpu_percent', 'N/A')}, threads={process.get('threads', 'N/A')}"
+    )
+    report.append(f"运行时长: {metrics.get('uptime_seconds', 'N/A')} 秒")
+    return report
+
+
+def main() -> int:
+    """Run one monitoring pass: probe ``/health``, update rule state, send mails.
+
+    Options come from the command line, with environment variables as defaults.
+    Four rules are evaluated: ``service_down``, ``health_fail``,
+    ``db_pool_exhausted`` and ``queue_backlog_high``.
+
+    A rule whose input cannot be observed in this run (endpoint unreachable, or
+    the relevant metrics section missing) is skipped: its streak and alert state
+    stay untouched, so an outage never produces a false "recovered" mail.
+
+    When a mail cannot be delivered, that rule's state is rolled back so the
+    same notification is attempted again on the next run.
+
+    Returns:
+        ``2`` when no recipient is configured, otherwise ``0``.
+    """
+    parser = argparse.ArgumentParser(description="zsglpt 邮件健康监控（基于 /health）")
+    parser.add_argument("--url", default=os.environ.get("HEALTH_URL", "http://127.0.0.1:51232/health"))
+    parser.add_argument("--to", default=os.environ.get("MONITOR_EMAIL_TO", ""))
+    parser.add_argument(
+        "--container",
+        default=os.environ.get("MONITOR_DOCKER_CONTAINER", "knowledge-automation-multiuser"),
+    )
+    parser.add_argument("--state-file", default=os.environ.get("MONITOR_STATE_FILE", DEFAULT_STATE_FILE))
+    parser.add_argument("--timeout", type=int, default=_safe_int(os.environ.get("MONITOR_TIMEOUT", 8), 8))
+    parser.add_argument(
+        "--remind-seconds",
+        type=int,
+        default=_safe_int(os.environ.get("MONITOR_REMIND_SECONDS", 3600), 3600),
+    )
+    parser.add_argument(
+        "--queue-threshold",
+        type=int,
+        default=_safe_int(os.environ.get("MONITOR_QUEUE_THRESHOLD", 50), 50),
+    )
+    parser.add_argument(
+        "--queue-streak",
+        type=int,
+        default=_safe_int(os.environ.get("MONITOR_QUEUE_STREAK", 5), 5),
+    )
+    parser.add_argument(
+        "--db-pool-streak",
+        type=int,
+        default=_safe_int(os.environ.get("MONITOR_DB_POOL_STREAK", 3), 3),
+    )
+    parser.add_argument("--dry-run", action="store_true", help="仅打印，不实际发邮件")
+    args = parser.parse_args()
+
+    if not args.to:
+        print("[monitor] 缺少收件人，请设置 --to 或 MONITOR_EMAIL_TO", flush=True)
+        return 2
+
+    state = _load_state(args.state_file)
+    now_ts = time.time()
+
+    status, data, fetch_error = _fetch_health(args.url, args.timeout)
+    if not isinstance(data, dict):
+        data = {}
+    data["_monitor_url"] = args.url
+
+    metrics = data.get("metrics") if isinstance(data.get("metrics"), dict) else {}
+    db_pool = metrics.get("db_pool") if isinstance(metrics.get("db_pool"), dict) else {}
+    task_queue = metrics.get("task_queue") if isinstance(metrics.get("task_queue"), dict) else {}
+
+    service_down = status is None
+
+    # ``None`` means "not observable in this run". Without an HTTP response or
+    # the relevant metrics there is no evidence that a rule is healthy, so it
+    # must not be treated as recovered.
+    health_fail = None
+    if not service_down:
+        health_fail = bool(status >= 500 or (not data.get("ok", False)) or (not data.get("db_ok", False)))
+
+    db_pool_exhausted = None
+    if db_pool:
+        db_pool_exhausted = (
+            _safe_int(db_pool.get("pool_size"), 0) > 0
+            and _safe_int(db_pool.get("available"), 0) <= 0
+            and _safe_int(db_pool.get("in_use"), 0) >= _safe_int(db_pool.get("pool_size"), 0)
+        )
+
+    queue_backlog_high = None
+    if task_queue:
+        queue_backlog_high = _safe_int(task_queue.get("pending_total"), 0) >= max(1, int(args.queue_threshold))
+
+    rule_defs = [
+        ("service_down", service_down, 1),
+        ("health_fail", health_fail, 1),
+        ("db_pool_exhausted", db_pool_exhausted, max(1, int(args.db_pool_streak))),
+        ("queue_backlog_high", queue_backlog_high, max(1, int(args.queue_streak))),
+    ]
+
+    # Each entry: (rule name, action, copy of the rule state before this run).
+    pending_notifications: list[tuple[str, str, Dict[str, Any] | None]] = []
+    for rule_name, bad, threshold in rule_defs:
+        if bad is None:
+            continue
+        known_rules = state.get("rules")
+        previous = known_rules.get(rule_name) if isinstance(known_rules, dict) else None
+        snapshot = dict(previous) if isinstance(previous, dict) else None
+
+        streak = _inc_streak(state, rule_name, bad)
+        action = _rule_transition(
+            state,
+            rule_name=rule_name,
+            bad=bad,
+            streak=streak,
+            threshold=threshold,
+            remind_seconds=args.remind_seconds,
+            now_ts=now_ts,
+        )
+        if action:
+            pending_notifications.append((rule_name, action, snapshot))
+
+    # Persist before sending so a crash or hang during delivery cannot lose the
+    # streak counters gathered in this run.
+    _save_state(args.state_file, state)
+
+    if not pending_notifications:
+        print(f"[monitor] {_now_text()} 正常，无需发送邮件")
+        return 0
+
+    common_lines = _build_common_lines(status, data, fetch_error)
+
+    rolled_back = False
+    for rule_name, action, snapshot in pending_notifications:
+        level = "告警" if action == "alert" else "恢复"
+        subject = f"[zsglpt健康监控][{level}] {rule_name}"
+        body_lines = [
+            f"规则: {rule_name}",
+            f"状态: {level}",
+            "",
+            *common_lines,
+        ]
+        body = "\n".join(body_lines)
+
+        if args.dry_run:
+            print(f"[monitor][dry-run] subject={subject}\n{body}\n")
+            continue
+
+        ok, msg = _send_email_via_container(
+            container_name=args.container,
+            to_email=args.to,
+            subject=subject,
+            body=body,
+        )
+        if ok:
+            print(f"[monitor] 邮件已发送: {subject}")
+            continue
+
+        print(f"[monitor] 邮件发送失败: {subject} | {msg}")
+        # Undo the transition: otherwise the rule would count as "notified" and
+        # stay silent until the next reminder (or forever, for a recovery).
+        rules = state.setdefault("rules", {})
+        if snapshot is None:
+            rules.pop(rule_name, None)
+        else:
+            rules[rule_name] = snapshot
+        rolled_back = True
+
+    if rolled_back:
+        _save_state(args.state_file, state)
+
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
+