Files
zsglpt/services/slow_sql_metrics.py

209 lines
6.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Slow-SQL metrics (lightweight, in-memory).
- Records samples of SQL executions that exceed the slow threshold
- Maintains aggregate statistics over a recent window (24 hours by default)
- Exposes TOP-SQL rankings and a list of the most recent slow statements
"""
from __future__ import annotations
import os
import threading
import time
from collections import deque
# All limits are environment-overridable with clamped lower bounds; the
# "or <default>" guards against an empty-string env value.
# Duration (ms) above which a query is considered slow.
_SLOW_SQL_THRESHOLD_MS = max(0.0, float(os.environ.get("DB_SLOW_QUERY_MS", "120") or 120))
# Retention window for aggregated events, in seconds (>= 10 minutes).
_WINDOW_SECONDS = max(600, int(os.environ.get("DB_SLOW_SQL_WINDOW_SECONDS", "86400") or 86400))
# Number of rows in the TOP-SQL ranking.
_TOP_LIMIT = max(5, int(os.environ.get("DB_SLOW_SQL_TOP_LIMIT", "12") or 12))
# Number of rows kept in the "most recent slow SQL" list.
_RECENT_LIMIT = max(10, int(os.environ.get("DB_SLOW_SQL_RECENT_LIMIT", "50") or 50))
# Hard cap on retained events, independent of the time window.
_MAX_EVENTS = max(_RECENT_LIMIT, int(os.environ.get("DB_SLOW_SQL_MAX_EVENTS", "20000") or 20000))
# Maximum stored length of a compacted SQL statement.
_SQL_MAX_LEN = max(80, int(os.environ.get("DB_SLOW_QUERY_SQL_MAX_LEN", "240") or 240))
# Separate lock for the two runtime-tunable settings below.
_runtime_lock = threading.Lock()
_runtime_threshold_ms = _SLOW_SQL_THRESHOLD_MS
_runtime_sql_max_len = _SQL_MAX_LEN
# Lock guarding _state; "recent" is bounded by deque maxlen, "events" is
# pruned explicitly by _prune_events_locked().
_lock = threading.Lock()
_state = {
"start_ts": time.time(),
"last_slow_ts": 0.0,
"events": deque(),
"recent": deque(maxlen=_RECENT_LIMIT),
}
def _compact_text(value: str, max_len: int) -> str:
text = " ".join(str(value or "").split())
if len(text) <= max_len:
return text
return f"{text[: max_len - 3]}..."
def _get_runtime_values() -> tuple[float, int]:
    """Atomically snapshot the (threshold_ms, sql_max_len) runtime settings."""
    with _runtime_lock:
        threshold = float(_runtime_threshold_ms)
        max_len = int(_runtime_sql_max_len)
    return threshold, max_len
def configure_slow_sql_runtime(*, threshold_ms=None, sql_max_len=None) -> dict:
    """Adjust the slow threshold and/or SQL truncation length at runtime.

    A ``None`` argument leaves that setting untouched. Values are clamped
    (threshold >= 0 ms, sql_max_len >= 80). Returns the effective settings
    as ``{"threshold_ms": float, "sql_max_len": int}``.
    """
    global _runtime_threshold_ms, _runtime_sql_max_len
    with _runtime_lock:
        if threshold_ms is not None:
            _runtime_threshold_ms = max(0.0, float(threshold_ms))
        if sql_max_len is not None:
            _runtime_sql_max_len = max(80, int(sql_max_len))
        effective_threshold = float(_runtime_threshold_ms)
        effective_max_len = int(_runtime_sql_max_len)
    return {"threshold_ms": effective_threshold, "sql_max_len": effective_max_len}
def _compact_sql(sql: str) -> str:
    """Normalize a SQL statement to one line, truncated to the runtime limit."""
    limit = _get_runtime_values()[1]
    return _compact_text(str(sql or ""), limit)
def _compact_params(params_info: str) -> str:
    """Normalize a parameter summary; falls back to "none", capped at 64 chars."""
    summary = params_info or "none"
    return _compact_text(str(summary), 64)
def _prune_events_locked(now_ts: float) -> None:
    """Drop events older than the retention window, then enforce the size cap.

    Caller must already hold ``_lock``.
    """
    events = _state["events"]
    oldest_allowed = now_ts - float(_WINDOW_SECONDS)
    # Events are appended in time order, so expired ones sit at the left end.
    while events:
        if float(events[0].get("time", 0.0) or 0.0) >= oldest_allowed:
            break
        events.popleft()
    # Hard cap on retained events regardless of age.
    while len(events) > int(_MAX_EVENTS):
        events.popleft()
def record_slow_sql(*, sql: str, duration_ms: float, params_info: str = "none") -> None:
    """Record one slow-SQL sample (compacted SQL, duration in ms, params summary).

    Thresholding is not applied here; callers decide what counts as slow.
    """
    now = time.time()
    sample = {
        "time": now,
        "sql": _compact_sql(sql),
        "duration_ms": round(max(0.0, float(duration_ms or 0.0)), 2),
        "params": _compact_params(params_info),
    }
    with _lock:
        _prune_events_locked(now)
        _state["events"].append(sample)
        _state["recent"].append(sample)  # deque maxlen bounds this list
        _state["last_slow_ts"] = now
def _aggregate_slow_sql(events: list[dict]) -> tuple[dict[str, dict], float, float]:
    """Group events by SQL text; return (grouped, total_duration_ms, max_duration_ms)."""
    grouped: dict[str, dict] = {}
    total_duration_ms = 0.0
    max_duration_ms = 0.0
    for item in events:
        sql_text = str(item.get("sql") or "-")
        duration = float(item.get("duration_ms") or 0.0)
        ts = float(item.get("time") or 0.0)
        params_text = str(item.get("params") or "none")
        total_duration_ms += duration
        if duration > max_duration_ms:
            max_duration_ms = duration
        bucket = grouped.get(sql_text)
        if bucket is None:
            bucket = {
                "sql": sql_text,
                "count": 0,
                "total_ms": 0.0,
                "max_ms": 0.0,
                "last_ts": 0.0,
                "params": params_text,
            }
            grouped[sql_text] = bucket
        bucket["count"] = int(bucket["count"] or 0) + 1
        bucket["total_ms"] = float(bucket["total_ms"] or 0.0) + duration
        if duration > float(bucket["max_ms"] or 0.0):
            bucket["max_ms"] = duration
            # Keep the params sample that belongs to the slowest occurrence.
            bucket["params"] = params_text
        if ts >= float(bucket["last_ts"] or 0.0):
            bucket["last_ts"] = ts
    return grouped, total_duration_ms, max_duration_ms


def _rank_top_sql(grouped: dict[str, dict]) -> list[dict]:
    """Rank grouped stats by (count, max_ms, total_ms) desc and format rows."""
    ranked = sorted(
        grouped.values(),
        key=lambda row: (
            int(row.get("count", 0) or 0),
            float(row.get("max_ms", 0.0) or 0.0),
            float(row.get("total_ms", 0.0) or 0.0),
        ),
        reverse=True,
    )[:_TOP_LIMIT]
    top_sql = []
    for idx, row in enumerate(ranked, start=1):
        count = int(row.get("count", 0) or 0)
        total_ms = float(row.get("total_ms", 0.0) or 0.0)
        top_sql.append(
            {
                "rank": idx,
                "sql": row.get("sql") or "-",
                "count": count,
                "avg_ms": round(total_ms / count, 2) if count > 0 else 0.0,
                "max_ms": round(float(row.get("max_ms", 0.0) or 0.0), 2),
                "last_ts": int(float(row.get("last_ts", 0.0) or 0.0)),
                "sample_params": row.get("params") or "none",
            }
        )
    return top_sql


def _recent_within_window(recent_rows: list[dict], cutoff_ts: float) -> list[dict]:
    """Format recent samples, dropping any that aged out of the window."""
    return [
        {
            "time": int(float(item.get("time") or 0.0)),
            "sql": str(item.get("sql") or "-"),
            "duration_ms": round(float(item.get("duration_ms") or 0.0), 2),
            "params": str(item.get("params") or "none"),
        }
        for item in recent_rows
        if float(item.get("time") or 0.0) >= cutoff_ts
    ]


def get_slow_sql_metrics_snapshot() -> dict:
    """Build a point-in-time snapshot of slow-SQL metrics.

    Prunes expired events, then returns a dict with window/limit settings,
    aggregate counters (total, unique, avg/max duration), the TOP-SQL
    ranking, and the most recent slow statements still inside the window.
    """
    now = time.time()
    with _lock:
        _prune_events_locked(now)
        # Copy state under the lock; all aggregation below works on the
        # copies, so the critical section stays short.
        events = list(_state["events"])
        recent_rows = list(_state["recent"])
        last_slow_ts = float(_state.get("last_slow_ts") or 0.0)
    grouped, total_duration_ms, max_duration_ms = _aggregate_slow_sql(events)
    top_sql = _rank_top_sql(grouped)
    recent = _recent_within_window(recent_rows, now - float(_WINDOW_SECONDS))
    total_events = len(events)
    avg_duration_ms = round(total_duration_ms / total_events, 2) if total_events > 0 else 0.0
    runtime_threshold_ms, _ = _get_runtime_values()
    return {
        "since_ts": int(float(events[0].get("time") or 0.0)) if events else 0,
        "window_seconds": _WINDOW_SECONDS,
        "top_limit": _TOP_LIMIT,
        "recent_limit": _RECENT_LIMIT,
        "slow_threshold_ms": runtime_threshold_ms,
        "total_slow_queries": total_events,
        "unique_sql": len(grouped),
        "avg_duration_ms": avg_duration_ms,
        "max_duration_ms": round(max_duration_ms, 2),
        "last_slow_ts": int(last_slow_ts) if last_slow_ts > 0 else 0,
        "top_sql": top_sql,
        "recent_slow_sql": recent,
    }