refactor: optimize structure, stability and runtime performance

This commit is contained in:
2026-02-07 00:35:11 +08:00
parent fae21329d7
commit bf29ac1924
44 changed files with 6894 additions and 4792 deletions

View File

@@ -2,6 +2,7 @@
# -*- coding: utf-8 -*-
from __future__ import annotations
import os
import threading
import time
from datetime import datetime
@@ -10,6 +11,8 @@ from app_config import get_config
from app_logger import get_logger
from services.state import (
cleanup_expired_ip_rate_limits,
cleanup_expired_ip_request_rates,
cleanup_expired_login_security_state,
safe_cleanup_expired_batches,
safe_cleanup_expired_captcha,
safe_cleanup_expired_pending_random,
@@ -31,6 +34,69 @@ PENDING_RANDOM_EXPIRE_SECONDS = int(getattr(config, "PENDING_RANDOM_EXPIRE_SECON
_kdocs_offline_notified: bool = False
def _to_int(value, default: int = 0) -> int:
try:
return int(value)
except Exception:
return int(default)
def _collect_active_user_ids() -> set[int]:
    """Gather the user ids of every entry currently in the task-status map.

    Entries that are not dicts, lack a ``user_id``, or carry a value that
    cannot be coerced to ``int`` are silently skipped.
    """
    collected: set[int] = set()
    for _key, payload in safe_iter_task_status_items():
        raw_id = payload.get("user_id") if isinstance(payload, dict) else None
        if raw_id is None:
            continue
        try:
            collected.add(int(raw_id))
        except Exception:
            pass
    return collected
def _find_expired_user_cache_ids(current_time: float, active_user_ids: set[int]) -> list[int]:
    """Return ids of cached user accounts whose last access is stale.

    A user id is included only when all of these hold:
    - its last-access timestamp is older than ``USER_ACCOUNTS_EXPIRE_SECONDS``;
    - it is not in *active_user_ids* (i.e. has no running task);
    - the account cache actually holds an entry for it.

    Pairs whose id/timestamp cannot be parsed are skipped.
    """
    stale_ids: list[int] = []
    for raw_id, raw_ts in (safe_get_user_accounts_last_access_items() or []):
        try:
            uid = int(raw_id)
            last_seen = float(raw_ts)
        except Exception:
            continue
        if (current_time - last_seen) <= USER_ACCOUNTS_EXPIRE_SECONDS:
            continue
        if uid in active_user_ids:
            continue
        if safe_has_user(uid):
            stale_ids.append(uid)
    return stale_ids
def _find_completed_task_status_ids(current_time: float) -> list[str]:
    """List task-status ids that reached a terminal state over 10 minutes ago.

    Terminal states are the literal status strings "已完成", "失败" and
    "已停止"; a missing/falsy ``start_time`` is treated as 0 (epoch), which
    makes such entries immediately eligible. Non-dict entries are ignored.
    """
    terminal_states = ("已完成", "失败", "已停止")
    finished_ids: list[str] = []
    for task_id, payload in safe_iter_task_status_items():
        state = payload.get("status") if isinstance(payload, dict) else None
        if state not in terminal_states:
            continue
        started_at = float(payload.get("start_time", 0) or 0)
        if current_time - started_at > 600:  # 10分钟
            finished_ids.append(task_id)
    return finished_ids
def _reap_zombie_processes() -> None:
    """Reap finished child processes so they do not linger as zombies.

    Repeatedly calls non-blocking ``os.waitpid`` until no exited child is
    pending (pid 0) or any error occurs. ``ChildProcessError`` ("no child
    processes") and every other exception end the sweep silently — this is
    strictly best-effort and must never raise.
    """
    while True:
        try:
            reaped_pid, _exit_status = os.waitpid(-1, os.WNOHANG)
            if reaped_pid == 0:
                break  # children exist, but none have exited yet
            logger.debug(f"已回收僵尸进程: PID={reaped_pid}")
        except Exception:
            # Covers ChildProcessError (no children at all) and any
            # platform-specific failure alike — original behavior.
            break
def cleanup_expired_data() -> None:
"""定期清理过期数据,防止内存泄漏(逻辑保持不变)。"""
current_time = time.time()
@@ -43,48 +109,36 @@ def cleanup_expired_data() -> None:
if deleted_ips:
logger.debug(f"已清理 {deleted_ips} 个过期IP限流记录")
expired_users = []
last_access_items = safe_get_user_accounts_last_access_items()
if last_access_items:
task_items = safe_iter_task_status_items()
active_user_ids = {int(info.get("user_id")) for _, info in task_items if info.get("user_id")}
for user_id, last_access in last_access_items:
if (current_time - float(last_access)) <= USER_ACCOUNTS_EXPIRE_SECONDS:
continue
if int(user_id) in active_user_ids:
continue
if safe_has_user(user_id):
expired_users.append(int(user_id))
deleted_ip_requests = cleanup_expired_ip_request_rates(current_time)
if deleted_ip_requests:
logger.debug(f"已清理 {deleted_ip_requests} 个过期IP请求频率记录")
login_cleanup_stats = cleanup_expired_login_security_state(current_time)
login_cleanup_total = sum(int(v or 0) for v in login_cleanup_stats.values())
if login_cleanup_total:
logger.debug(
"已清理登录风控缓存: "
f"失败计数={login_cleanup_stats.get('failures', 0)}, "
f"限流桶={login_cleanup_stats.get('rate_limits', 0)}, "
f"扫描状态={login_cleanup_stats.get('scan_states', 0)}, "
f"短时锁={login_cleanup_stats.get('ip_user_locks', 0)}, "
f"告警状态={login_cleanup_stats.get('alerts', 0)}"
)
active_user_ids = _collect_active_user_ids()
expired_users = _find_expired_user_cache_ids(current_time, active_user_ids)
for user_id in expired_users:
safe_remove_user_accounts(user_id)
if expired_users:
logger.debug(f"已清理 {len(expired_users)} 个过期用户账号缓存")
completed_tasks = []
for account_id, status_data in safe_iter_task_status_items():
if status_data.get("status") in ["已完成", "失败", "已停止"]:
start_time = float(status_data.get("start_time", 0) or 0)
if (current_time - start_time) > 600: # 10分钟
completed_tasks.append(account_id)
for account_id in completed_tasks:
completed_task_ids = _find_completed_task_status_ids(current_time)
for account_id in completed_task_ids:
safe_remove_task_status(account_id)
if completed_tasks:
logger.debug(f"已清理 {len(completed_tasks)} 个已完成任务状态")
if completed_task_ids:
logger.debug(f"已清理 {len(completed_task_ids)} 个已完成任务状态")
try:
import os
while True:
try:
pid, status = os.waitpid(-1, os.WNOHANG)
if pid == 0:
break
logger.debug(f"已回收僵尸进程: PID={pid}")
except ChildProcessError:
break
except Exception:
pass
_reap_zombie_processes()
deleted_batches = safe_cleanup_expired_batches(BATCH_TASK_EXPIRE_SECONDS, current_time)
if deleted_batches:
@@ -95,52 +149,39 @@ def cleanup_expired_data() -> None:
logger.debug(f"已清理 {deleted_random} 个过期随机延迟任务")
def check_kdocs_online_status() -> None:
"""检测金山文档登录状态,如果离线则发送邮件通知管理员(每次掉线只通知一次)"""
global _kdocs_offline_notified
def _load_kdocs_monitor_config() -> str | None:
    """Return the admin notification email for KDocs monitoring, or ``None``.

    ``None`` signals that monitoring should do nothing this round: system
    config is missing, the KDocs feature is disabled, admin notification is
    disabled, or no notification email is configured.
    """
    import database

    system_cfg = database.get_system_config()
    if not system_cfg:
        return None
    if not _to_int(system_cfg.get("kdocs_enabled"), 0):
        return None
    notify_enabled = _to_int(system_cfg.get("kdocs_admin_notify_enabled"), 0)
    notify_email = str(system_cfg.get("kdocs_admin_notify_email") or "").strip()
    if notify_enabled and notify_email:
        return notify_email
    return None
def _is_kdocs_offline(status: dict) -> tuple[bool, bool, bool | None]:
login_required = bool(status.get("login_required", False))
last_login_ok = status.get("last_login_ok")
is_offline = login_required or (last_login_ok is False)
return is_offline, login_required, last_login_ok
def _send_kdocs_offline_alert(admin_notify_email: str, *, login_required: bool, last_login_ok) -> bool:
try:
import database
from services.kdocs_uploader import get_kdocs_uploader
import email_service
# 获取系统配置
cfg = database.get_system_config()
if not cfg:
return
# 检查是否启用了金山文档功能
kdocs_enabled = int(cfg.get("kdocs_enabled") or 0)
if not kdocs_enabled:
return
# 检查是否启用了管理员通知
admin_notify_enabled = int(cfg.get("kdocs_admin_notify_enabled") or 0)
admin_notify_email = (cfg.get("kdocs_admin_notify_email") or "").strip()
if not admin_notify_enabled or not admin_notify_email:
return
# 获取金山文档状态
kdocs = get_kdocs_uploader()
status = kdocs.get_status()
login_required = status.get("login_required", False)
last_login_ok = status.get("last_login_ok")
# 如果需要登录或最后登录状态不是成功
is_offline = login_required or (last_login_ok is False)
if is_offline:
# 已经通知过了,不再重复通知
if _kdocs_offline_notified:
logger.debug("[KDocs监控] 金山文档离线,已通知过,跳过重复通知")
return
# 发送邮件通知
try:
import email_service
now_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
subject = "【金山文档离线告警】需要重新登录"
body = f"""
now_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
subject = "【金山文档离线告警】需要重新登录"
body = f"""
您好,
系统检测到金山文档上传功能已离线,需要重新扫码登录。
@@ -155,58 +196,92 @@ def check_kdocs_online_status() -> None:
---
此邮件由系统自动发送,请勿直接回复。
"""
email_service.send_email_async(
to_email=admin_notify_email,
subject=subject,
body=body,
email_type="kdocs_offline_alert",
)
_kdocs_offline_notified = True # 标记为已通知
logger.warning(f"[KDocs监控] 金山文档离线,已发送通知邮件到 {admin_notify_email}")
except Exception as e:
logger.error(f"[KDocs监控] 发送离线通知邮件失败: {e}")
else:
# 恢复在线,重置通知状态
email_service.send_email_async(
to_email=admin_notify_email,
subject=subject,
body=body,
email_type="kdocs_offline_alert",
)
logger.warning(f"[KDocs监控] 金山文档离线,已发送通知邮件到 {admin_notify_email}")
return True
except Exception as e:
logger.error(f"[KDocs监控] 发送离线通知邮件失败: {e}")
return False
def check_kdocs_online_status() -> None:
"""检测金山文档登录状态,如果离线则发送邮件通知管理员(每次掉线只通知一次)"""
global _kdocs_offline_notified
try:
admin_notify_email = _load_kdocs_monitor_config()
if not admin_notify_email:
return
from services.kdocs_uploader import get_kdocs_uploader
kdocs = get_kdocs_uploader()
status = kdocs.get_status() or {}
is_offline, login_required, last_login_ok = _is_kdocs_offline(status)
if is_offline:
if _kdocs_offline_notified:
logger.info("[KDocs监控] 金山文档已恢复在线,重置通知状态")
_kdocs_offline_notified = False
logger.debug("[KDocs监控] 金山文档状态正常")
logger.debug("[KDocs监控] 金山文档离线,已通知过,跳过重复通知")
return
if _send_kdocs_offline_alert(
admin_notify_email,
login_required=login_required,
last_login_ok=last_login_ok,
):
_kdocs_offline_notified = True
return
if _kdocs_offline_notified:
logger.info("[KDocs监控] 金山文档已恢复在线,重置通知状态")
_kdocs_offline_notified = False
logger.debug("[KDocs监控] 金山文档状态正常")
except Exception as e:
logger.error(f"[KDocs监控] 检测失败: {e}")
def start_cleanup_scheduler() -> None:
"""启动定期清理调度器"""
def cleanup_loop():
def _start_daemon_loop(name: str, *, startup_delay: float, interval_seconds: float, job, error_tag: str):
def loop():
if startup_delay > 0:
time.sleep(startup_delay)
while True:
try:
time.sleep(300) # 每5分钟执行一次清理
cleanup_expired_data()
job()
time.sleep(interval_seconds)
except Exception as e:
logger.error(f"清理任务执行失败: {e}")
logger.error(f"{error_tag}: {e}")
time.sleep(min(60.0, max(1.0, interval_seconds / 5.0)))
cleanup_thread = threading.Thread(target=cleanup_loop, daemon=True, name="cleanup-scheduler")
cleanup_thread.start()
thread = threading.Thread(target=loop, daemon=True, name=name)
thread.start()
return thread
def start_cleanup_scheduler() -> None:
    """Start the periodic in-memory cleanup scheduler.

    Spawns a daemon loop (via ``_start_daemon_loop``) that waits 300 s after
    process startup, then invokes ``cleanup_expired_data`` every 300 s;
    failures are logged with the given error tag.
    """
    _start_daemon_loop(
        "cleanup-scheduler",
        startup_delay=300,
        interval_seconds=300,
        job=cleanup_expired_data,
        error_tag="清理任务执行失败",
    )
    logger.info("内存清理调度器已启动")
def start_kdocs_monitor() -> None:
"""启动金山文档状态监控"""
def monitor_loop():
# 启动后等待 60 秒再开始检测(给系统初始化的时间)
time.sleep(60)
while True:
try:
check_kdocs_online_status()
time.sleep(300) # 每5分钟检测一次
except Exception as e:
logger.error(f"[KDocs监控] 监控任务执行失败: {e}")
time.sleep(60)
monitor_thread = threading.Thread(target=monitor_loop, daemon=True, name="kdocs-monitor")
monitor_thread.start()
_start_daemon_loop(
"kdocs-monitor",
startup_delay=60,
interval_seconds=300,
job=check_kdocs_online_status,
error_tag="[KDocs监控] 监控任务执行失败",
)
logger.info("[KDocs监控] 金山文档状态监控已启动每5分钟检测一次")