refactor: optimize structure, stability and runtime performance
This commit is contained in:
60
scripts/HEALTH_MONITOR_README.md
Normal file
60
scripts/HEALTH_MONITOR_README.md
Normal file
@@ -0,0 +1,60 @@
|
||||
# 健康监控(邮件版)
|
||||
|
||||
本目录提供 `health_email_monitor.py`,通过调用 `/health` 接口并使用**容器内已有邮件配置**发告警邮件。
|
||||
|
||||
## 1) 快速试跑
|
||||
|
||||
```bash
|
||||
cd /root/zsglpt
|
||||
python3 scripts/health_email_monitor.py \
|
||||
--to 你的告警邮箱@example.com \
|
||||
--container knowledge-automation-multiuser \
|
||||
--url http://127.0.0.1:51232/health \
|
||||
--dry-run
|
||||
```
|
||||
|
||||
去掉 `--dry-run` 即会实际发邮件。
|
||||
|
||||
## 2) 建议 cron(每分钟)
|
||||
|
||||
```bash
|
||||
* * * * * cd /root/zsglpt && /usr/bin/python3 scripts/health_email_monitor.py \
|
||||
--to 你的告警邮箱@example.com \
|
||||
--container knowledge-automation-multiuser \
|
||||
--url http://127.0.0.1:51232/health \
|
||||
>> /root/zsglpt/logs/health_monitor.log 2>&1
|
||||
```
|
||||
|
||||
## 3) 支持的规则
|
||||
|
||||
- `service_down`:健康接口请求失败(立即告警)
|
||||
- `health_fail`:返回 `ok/db_ok` 异常或 HTTP 5xx(立即告警)
|
||||
- `db_pool_exhausted`:连接池耗尽(默认连续 3 次才告警)
|
||||
- `queue_backlog_high`:任务堆积过高(默认 `pending_total >= 50` 且连续 5 次)
|
||||
|
||||
脚本支持恢复通知(规则恢复正常会发“恢复”邮件)。
|
||||
|
||||
## 4) 常用参数
|
||||
|
||||
- `--to`:收件人(必填)
|
||||
- `--container`:Docker 容器名(默认 `knowledge-automation-multiuser`)
|
||||
- `--url`:健康地址(默认 `http://127.0.0.1:51232/health`)
|
||||
- `--state-file`:状态文件路径(默认 `/tmp/zsglpt_health_monitor_state.json`)
|
||||
- `--remind-seconds`:重复告警间隔(默认 3600 秒)
|
||||
- `--queue-threshold`:队列告警阈值(默认 50)
|
||||
- `--queue-streak`:队列连续次数阈值(默认 5)
|
||||
- `--db-pool-streak`:连接池连续次数阈值(默认 3)
|
||||
|
||||
## 5) 环境变量方式(可选)
|
||||
|
||||
也可不用命令行参数,改用环境变量:
|
||||
|
||||
- `MONITOR_EMAIL_TO`
|
||||
- `MONITOR_DOCKER_CONTAINER`
|
||||
- `HEALTH_URL`
|
||||
- `MONITOR_STATE_FILE`
|
||||
- `MONITOR_REMIND_SECONDS`
|
||||
- `MONITOR_QUEUE_THRESHOLD`
|
||||
- `MONITOR_QUEUE_STREAK`
|
||||
- `MONITOR_DB_POOL_STREAK`
|
||||
|
||||
348
scripts/health_email_monitor.py
Normal file
348
scripts/health_email_monitor.py
Normal file
@@ -0,0 +1,348 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import subprocess
|
||||
import time
|
||||
from datetime import datetime
|
||||
from typing import Any, Dict, Tuple
|
||||
from urllib.error import HTTPError, URLError
|
||||
from urllib.request import Request, urlopen
|
||||
|
||||
DEFAULT_STATE_FILE = "/tmp/zsglpt_health_monitor_state.json"
|
||||
|
||||
|
||||
def _now_text() -> str:
|
||||
return datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
||||
|
||||
|
||||
def _safe_int(value: Any, default: int = 0) -> int:
|
||||
try:
|
||||
return int(value)
|
||||
except Exception:
|
||||
return int(default)
|
||||
|
||||
|
||||
def _safe_float(value: Any, default: float = 0.0) -> float:
|
||||
try:
|
||||
return float(value)
|
||||
except Exception:
|
||||
return float(default)
|
||||
|
||||
|
||||
def _load_state(path: str) -> Dict[str, Any]:
|
||||
if not path or not os.path.exists(path):
|
||||
return {
|
||||
"version": 1,
|
||||
"rules": {},
|
||||
"counters": {},
|
||||
}
|
||||
try:
|
||||
with open(path, "r", encoding="utf-8") as f:
|
||||
raw = json.load(f)
|
||||
if not isinstance(raw, dict):
|
||||
raise ValueError("state is not dict")
|
||||
raw.setdefault("version", 1)
|
||||
raw.setdefault("rules", {})
|
||||
raw.setdefault("counters", {})
|
||||
return raw
|
||||
except Exception:
|
||||
return {
|
||||
"version": 1,
|
||||
"rules": {},
|
||||
"counters": {},
|
||||
}
|
||||
|
||||
|
||||
def _save_state(path: str, state: Dict[str, Any]) -> None:
|
||||
if not path:
|
||||
return
|
||||
state_dir = os.path.dirname(path)
|
||||
if state_dir:
|
||||
os.makedirs(state_dir, exist_ok=True)
|
||||
tmp_path = f"{path}.tmp"
|
||||
with open(tmp_path, "w", encoding="utf-8") as f:
|
||||
json.dump(state, f, ensure_ascii=False, indent=2)
|
||||
os.replace(tmp_path, path)
|
||||
|
||||
|
||||
def _fetch_health(url: str, timeout: int) -> Tuple[int | None, Dict[str, Any], str | None]:
|
||||
req = Request(
|
||||
url,
|
||||
headers={
|
||||
"User-Agent": "zsglpt-health-email-monitor/1.0",
|
||||
"Accept": "application/json",
|
||||
},
|
||||
method="GET",
|
||||
)
|
||||
try:
|
||||
with urlopen(req, timeout=max(1, int(timeout))) as resp:
|
||||
status = int(resp.getcode())
|
||||
body = resp.read().decode("utf-8", errors="ignore")
|
||||
except HTTPError as e:
|
||||
status = int(getattr(e, "code", 0) or 0)
|
||||
body = ""
|
||||
try:
|
||||
body = e.read().decode("utf-8", errors="ignore")
|
||||
except Exception:
|
||||
pass
|
||||
data = {}
|
||||
if body:
|
||||
try:
|
||||
data = json.loads(body)
|
||||
if not isinstance(data, dict):
|
||||
data = {}
|
||||
except Exception:
|
||||
data = {}
|
||||
return status, data, f"HTTPError: {e}"
|
||||
except URLError as e:
|
||||
return None, {}, f"URLError: {e}"
|
||||
except Exception as e:
|
||||
return None, {}, f"RequestError: {e}"
|
||||
|
||||
data: Dict[str, Any] = {}
|
||||
if body:
|
||||
try:
|
||||
loaded = json.loads(body)
|
||||
if isinstance(loaded, dict):
|
||||
data = loaded
|
||||
except Exception:
|
||||
data = {}
|
||||
|
||||
return status, data, None
|
||||
|
||||
|
||||
def _inc_streak(state: Dict[str, Any], key: str, bad: bool) -> int:
|
||||
counters = state.setdefault("counters", {})
|
||||
current = _safe_int(counters.get(key), 0)
|
||||
current = (current + 1) if bad else 0
|
||||
counters[key] = current
|
||||
return current
|
||||
|
||||
|
||||
def _rule_transition(
|
||||
state: Dict[str, Any],
|
||||
*,
|
||||
rule_name: str,
|
||||
bad: bool,
|
||||
streak: int,
|
||||
threshold: int,
|
||||
remind_seconds: int,
|
||||
now_ts: float,
|
||||
) -> str | None:
|
||||
rules = state.setdefault("rules", {})
|
||||
rule_state = rules.setdefault(rule_name, {"active": False, "last_sent": 0})
|
||||
|
||||
is_active = bool(rule_state.get("active", False))
|
||||
last_sent = _safe_float(rule_state.get("last_sent", 0), 0.0)
|
||||
threshold = max(1, int(threshold))
|
||||
remind_seconds = max(60, int(remind_seconds))
|
||||
|
||||
if bad and streak >= threshold:
|
||||
if not is_active:
|
||||
rule_state["active"] = True
|
||||
rule_state["last_sent"] = now_ts
|
||||
return "alert"
|
||||
if (now_ts - last_sent) >= remind_seconds:
|
||||
rule_state["last_sent"] = now_ts
|
||||
return "alert"
|
||||
return None
|
||||
|
||||
if is_active and (not bad):
|
||||
rule_state["active"] = False
|
||||
rule_state["last_sent"] = now_ts
|
||||
return "recover"
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def _send_email_via_container(
|
||||
*,
|
||||
container_name: str,
|
||||
to_email: str,
|
||||
subject: str,
|
||||
body: str,
|
||||
timeout_seconds: int = 45,
|
||||
) -> Tuple[bool, str]:
|
||||
code = (
|
||||
"import sys,email_service;"
|
||||
"res=email_service.send_email(to_email=sys.argv[1],subject=sys.argv[2],body=sys.argv[3],email_type='health_monitor');"
|
||||
"ok=bool(res.get('success'));"
|
||||
"print('ok' if ok else ('error:'+str(res.get('error'))));"
|
||||
"raise SystemExit(0 if ok else 2)"
|
||||
)
|
||||
cmd = [
|
||||
"docker",
|
||||
"exec",
|
||||
container_name,
|
||||
"python",
|
||||
"-c",
|
||||
code,
|
||||
to_email,
|
||||
subject,
|
||||
body,
|
||||
]
|
||||
try:
|
||||
proc = subprocess.run(
|
||||
cmd,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=max(5, int(timeout_seconds)),
|
||||
check=False,
|
||||
)
|
||||
except Exception as e:
|
||||
return False, str(e)
|
||||
|
||||
output = (proc.stdout or "") + (proc.stderr or "")
|
||||
output = output.strip()
|
||||
return proc.returncode == 0, output
|
||||
|
||||
|
||||
def _build_common_lines(status: int | None, data: Dict[str, Any], fetch_error: str | None) -> list[str]:
|
||||
metrics = data.get("metrics") if isinstance(data.get("metrics"), dict) else {}
|
||||
db_pool = metrics.get("db_pool") if isinstance(metrics.get("db_pool"), dict) else {}
|
||||
process = metrics.get("process") if isinstance(metrics.get("process"), dict) else {}
|
||||
task_queue = metrics.get("task_queue") if isinstance(metrics.get("task_queue"), dict) else {}
|
||||
|
||||
lines = [
|
||||
f"时间: {_now_text()}",
|
||||
f"健康地址: {data.get('_monitor_url', '')}",
|
||||
f"HTTP状态: {status if status is not None else '请求失败'}",
|
||||
f"ok/db_ok: {data.get('ok')} / {data.get('db_ok')}",
|
||||
]
|
||||
if fetch_error:
|
||||
lines.append(f"请求错误: {fetch_error}")
|
||||
lines.extend(
|
||||
[
|
||||
f"队列: pending={task_queue.get('pending_total', 'N/A')}, running={task_queue.get('running_total', 'N/A')}",
|
||||
f"连接池: size={db_pool.get('pool_size', 'N/A')}, available={db_pool.get('available', 'N/A')}, in_use={db_pool.get('in_use', 'N/A')}",
|
||||
f"进程: rss_mb={process.get('rss_mb', 'N/A')}, cpu%={process.get('cpu_percent', 'N/A')}, threads={process.get('threads', 'N/A')}",
|
||||
f"运行时长: {metrics.get('uptime_seconds', 'N/A')} 秒",
|
||||
]
|
||||
)
|
||||
return lines
|
||||
|
||||
|
||||
def main() -> int:
|
||||
parser = argparse.ArgumentParser(description="zsglpt 邮件健康监控(基于 /health)")
|
||||
parser.add_argument("--url", default=os.environ.get("HEALTH_URL", "http://127.0.0.1:51232/health"))
|
||||
parser.add_argument("--to", default=os.environ.get("MONITOR_EMAIL_TO", ""))
|
||||
parser.add_argument(
|
||||
"--container",
|
||||
default=os.environ.get("MONITOR_DOCKER_CONTAINER", "knowledge-automation-multiuser"),
|
||||
)
|
||||
parser.add_argument("--state-file", default=os.environ.get("MONITOR_STATE_FILE", DEFAULT_STATE_FILE))
|
||||
parser.add_argument("--timeout", type=int, default=_safe_int(os.environ.get("MONITOR_TIMEOUT", 8), 8))
|
||||
parser.add_argument(
|
||||
"--remind-seconds",
|
||||
type=int,
|
||||
default=_safe_int(os.environ.get("MONITOR_REMIND_SECONDS", 3600), 3600),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--queue-threshold",
|
||||
type=int,
|
||||
default=_safe_int(os.environ.get("MONITOR_QUEUE_THRESHOLD", 50), 50),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--queue-streak",
|
||||
type=int,
|
||||
default=_safe_int(os.environ.get("MONITOR_QUEUE_STREAK", 5), 5),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--db-pool-streak",
|
||||
type=int,
|
||||
default=_safe_int(os.environ.get("MONITOR_DB_POOL_STREAK", 3), 3),
|
||||
)
|
||||
parser.add_argument("--dry-run", action="store_true", help="仅打印,不实际发邮件")
|
||||
args = parser.parse_args()
|
||||
|
||||
if not args.to:
|
||||
print("[monitor] 缺少收件人,请设置 --to 或 MONITOR_EMAIL_TO", flush=True)
|
||||
return 2
|
||||
|
||||
state = _load_state(args.state_file)
|
||||
now_ts = time.time()
|
||||
|
||||
status, data, fetch_error = _fetch_health(args.url, args.timeout)
|
||||
if not isinstance(data, dict):
|
||||
data = {}
|
||||
data["_monitor_url"] = args.url
|
||||
|
||||
metrics = data.get("metrics") if isinstance(data.get("metrics"), dict) else {}
|
||||
db_pool = metrics.get("db_pool") if isinstance(metrics.get("db_pool"), dict) else {}
|
||||
task_queue = metrics.get("task_queue") if isinstance(metrics.get("task_queue"), dict) else {}
|
||||
|
||||
service_down = status is None
|
||||
health_fail = bool(status is not None and (status >= 500 or (not data.get("ok", False)) or (not data.get("db_ok", False))))
|
||||
db_pool_exhausted = (
|
||||
_safe_int(db_pool.get("pool_size"), 0) > 0
|
||||
and _safe_int(db_pool.get("available"), 0) <= 0
|
||||
and _safe_int(db_pool.get("in_use"), 0) >= _safe_int(db_pool.get("pool_size"), 0)
|
||||
)
|
||||
queue_backlog_high = _safe_int(task_queue.get("pending_total"), 0) >= max(1, int(args.queue_threshold))
|
||||
|
||||
rule_defs = [
|
||||
("service_down", service_down, 1),
|
||||
("health_fail", health_fail, 1),
|
||||
("db_pool_exhausted", db_pool_exhausted, max(1, int(args.db_pool_streak))),
|
||||
("queue_backlog_high", queue_backlog_high, max(1, int(args.queue_streak))),
|
||||
]
|
||||
|
||||
pending_notifications: list[tuple[str, str]] = []
|
||||
for rule_name, bad, threshold in rule_defs:
|
||||
streak = _inc_streak(state, rule_name, bad)
|
||||
action = _rule_transition(
|
||||
state,
|
||||
rule_name=rule_name,
|
||||
bad=bad,
|
||||
streak=streak,
|
||||
threshold=threshold,
|
||||
remind_seconds=args.remind_seconds,
|
||||
now_ts=now_ts,
|
||||
)
|
||||
if action:
|
||||
pending_notifications.append((rule_name, action))
|
||||
|
||||
_save_state(args.state_file, state)
|
||||
|
||||
if not pending_notifications:
|
||||
print(f"[monitor] {_now_text()} 正常,无需发送邮件")
|
||||
return 0
|
||||
|
||||
common_lines = _build_common_lines(status, data, fetch_error)
|
||||
|
||||
for rule_name, action in pending_notifications:
|
||||
level = "告警" if action == "alert" else "恢复"
|
||||
subject = f"[zsglpt健康监控][{level}] {rule_name}"
|
||||
body_lines = [
|
||||
f"规则: {rule_name}",
|
||||
f"状态: {level}",
|
||||
"",
|
||||
*common_lines,
|
||||
]
|
||||
body = "\n".join(body_lines)
|
||||
|
||||
if args.dry_run:
|
||||
print(f"[monitor][dry-run] subject={subject}\n{body}\n")
|
||||
continue
|
||||
|
||||
ok, msg = _send_email_via_container(
|
||||
container_name=args.container,
|
||||
to_email=args.to,
|
||||
subject=subject,
|
||||
body=body,
|
||||
)
|
||||
if ok:
|
||||
print(f"[monitor] 邮件已发送: {subject}")
|
||||
else:
|
||||
print(f"[monitor] 邮件发送失败: {subject} | {msg}")
|
||||
|
||||
return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
raise SystemExit(main())
|
||||
|
||||
Reference in New Issue
Block a user