perf(db): tune sqlite pool and add maintenance scheduler

2026-02-07 12:53:43 +08:00
parent d77e439712
commit ff67a9bbab
5 changed files with 170 additions and 11 deletions
--- a/.env.example
+++ b/.env.example
@@ -18,6 +18,17 @@ SESSION_COOKIE_SECURE=false  # 使用HTTPS时设为true
 # ==================== 数据库配置 ====================
 DB_FILE=data/app_data.db
 DB_POOL_SIZE=5
 DB_CONNECT_TIMEOUT_SECONDS=10
 DB_BUSY_TIMEOUT_MS=10000
 DB_CACHE_SIZE_KB=8192
 DB_WAL_AUTOCHECKPOINT_PAGES=1000
 DB_MMAP_SIZE_MB=256
 DB_LOCK_RETRY_COUNT=3
 DB_LOCK_RETRY_BASE_MS=50
 DB_PRAGMA_OPTIMIZE_INTERVAL_SECONDS=21600
 DB_ANALYZE_INTERVAL_SECONDS=86400
 DB_WAL_CHECKPOINT_INTERVAL_SECONDS=43200
 DB_WAL_CHECKPOINT_MODE=PASSIVE
 # ==================== 并发控制配置 ====================
 MAX_CONCURRENT_GLOBAL=2
--- a/app.py
+++ b/app.py
@@ -35,7 +35,7 @@ from realtime.status_push import status_push_worker
 from routes import register_blueprints
 from security import init_security_middleware
 from services.checkpoints import init_checkpoint_manager
-from services.maintenance import start_cleanup_scheduler, start_kdocs_monitor
+from services.maintenance import start_cleanup_scheduler, start_database_maintenance_scheduler, start_kdocs_monitor
 from services.request_metrics import record_request_metric
 from services.models import User
 from services.runtime import init_runtime
@@ -407,6 +407,7 @@ if __name__ == "__main__":
    _init_optional_email_service()
    start_cleanup_scheduler()
    start_database_maintenance_scheduler()
    start_kdocs_monitor()
    _load_and_apply_scheduler_limits()
--- a/app_config.py
+++ b/app_config.py
@@ -126,6 +126,17 @@ class Config:
    # ==================== Database configuration ====================
    # Every setting below is read from the environment variable of the same
    # name, falling back to the default shown. Integer settings go through
    # int(), so a non-numeric value raises ValueError when this class body runs.
    # Database file path.
    DB_FILE = os.environ.get("DB_FILE", "data/app_data.db")
    DB_POOL_SIZE = int(os.environ.get("DB_POOL_SIZE", "5"))
    # Timeouts: connect timeout in seconds, busy timeout in milliseconds.
    DB_CONNECT_TIMEOUT_SECONDS = int(os.environ.get("DB_CONNECT_TIMEOUT_SECONDS", "10"))
    DB_BUSY_TIMEOUT_MS = int(os.environ.get("DB_BUSY_TIMEOUT_MS", "10000"))
    # Cache size (KB), WAL auto-checkpoint threshold (pages), mmap size (MB).
    DB_CACHE_SIZE_KB = int(os.environ.get("DB_CACHE_SIZE_KB", "8192"))
    DB_WAL_AUTOCHECKPOINT_PAGES = int(os.environ.get("DB_WAL_AUTOCHECKPOINT_PAGES", "1000"))
    DB_MMAP_SIZE_MB = int(os.environ.get("DB_MMAP_SIZE_MB", "256"))
    # Lock-conflict retry settings: number of retries and base delay in ms.
    DB_LOCK_RETRY_COUNT = int(os.environ.get("DB_LOCK_RETRY_COUNT", "3"))
    DB_LOCK_RETRY_BASE_MS = int(os.environ.get("DB_LOCK_RETRY_BASE_MS", "50"))
    # Intervals (seconds) for the periodic PRAGMA optimize / ANALYZE /
    # WAL checkpoint maintenance jobs.
    DB_PRAGMA_OPTIMIZE_INTERVAL_SECONDS = int(os.environ.get("DB_PRAGMA_OPTIMIZE_INTERVAL_SECONDS", "21600"))
    DB_ANALYZE_INTERVAL_SECONDS = int(os.environ.get("DB_ANALYZE_INTERVAL_SECONDS", "86400"))
    DB_WAL_CHECKPOINT_INTERVAL_SECONDS = int(os.environ.get("DB_WAL_CHECKPOINT_INTERVAL_SECONDS", "43200"))
    # Checkpoint mode is kept as a raw string here; it is not converted or
    # validated at this point.
    DB_WAL_CHECKPOINT_MODE = os.environ.get("DB_WAL_CHECKPOINT_MODE", "PASSIVE")
    # ==================== 浏览器配置 ====================
    SCREENSHOTS_DIR = os.environ.get("SCREENSHOTS_DIR", "截图")
@@ -249,6 +260,20 @@ class Config:
        if cls.DB_POOL_SIZE < 1:
            errors.append("DB_POOL_SIZE必须大于0")
        if cls.DB_CONNECT_TIMEOUT_SECONDS < 1:
            errors.append("DB_CONNECT_TIMEOUT_SECONDS必须大于0")
        if cls.DB_BUSY_TIMEOUT_MS < 100:
            errors.append("DB_BUSY_TIMEOUT_MS必须至少100毫秒")
        if cls.DB_CACHE_SIZE_KB < 1024:
            errors.append("DB_CACHE_SIZE_KB建议至少1024")
        if cls.DB_WAL_AUTOCHECKPOINT_PAGES < 100:
            errors.append("DB_WAL_AUTOCHECKPOINT_PAGES建议至少100")
        if cls.DB_MMAP_SIZE_MB < 0:
            errors.append("DB_MMAP_SIZE_MB不能为负数")
        if cls.DB_LOCK_RETRY_COUNT < 0:
            errors.append("DB_LOCK_RETRY_COUNT不能为负数")
        if cls.DB_LOCK_RETRY_BASE_MS < 10:
            errors.append("DB_LOCK_RETRY_BASE_MS建议至少10毫秒")
        # 验证日志配置
        if cls.LOG_LEVEL not in ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]:
--- a/db_pool.py
+++ b/db_pool.py
@@ -7,12 +7,28 @@
 import sqlite3
 import threading
 import time
 from queue import Empty, Full, Queue
 from app_config import get_config
 from app_logger import get_logger
 # Module-wide logger and configuration object.
 logger = get_logger("database")
 config = get_config()
 # SQLite tuning values, computed once at import time. Each one is coerced to
 # int and clamped to a lower bound with max(); a missing config attribute
 # falls back to the default passed to getattr().
 # Passed to sqlite3.connect(timeout=...) when a connection is created.
 DB_CONNECT_TIMEOUT_SECONDS = max(1, int(getattr(config, "DB_CONNECT_TIMEOUT_SECONDS", 10)))
 # NOTE(review): the floor here is 1000 ms, while the validation in
 # app_config.py only reports an error below 100 ms. Values of 100-999 pass
 # validation and are then silently raised to 1000 -- confirm which bound is
 # intended.
 DB_BUSY_TIMEOUT_MS = max(1000, int(getattr(config, "DB_BUSY_TIMEOUT_MS", 10000)))
 # Emitted as a negative PRAGMA cache_size value by the connection setup.
 DB_CACHE_SIZE_KB = max(1024, int(getattr(config, "DB_CACHE_SIZE_KB", 8192)))
 DB_WAL_AUTOCHECKPOINT_PAGES = max(100, int(getattr(config, "DB_WAL_AUTOCHECKPOINT_PAGES", 1000)))
 # 0 means the mmap_size PRAGMA is not issued at all.
 DB_MMAP_SIZE_MB = max(0, int(getattr(config, "DB_MMAP_SIZE_MB", 256)))
 # Number of retries after the first failed commit (0 = no retry) and the
 # base delay in milliseconds used for the retry sleep.
 DB_LOCK_RETRY_COUNT = max(0, int(getattr(config, "DB_LOCK_RETRY_COUNT", 3)))
 DB_LOCK_RETRY_BASE_MS = max(10, int(getattr(config, "DB_LOCK_RETRY_BASE_MS", 50)))
 def _is_lock_conflict_error(error: sqlite3.OperationalError) -> bool:
    message = str(error or "").lower()
    return ("locked" in message) or ("busy" in message)
 class ConnectionPool:
@@ -46,16 +62,29 @@ class ConnectionPool:
    def _create_connection(self):
        """Create and configure a new SQLite connection to ``self.database``.

        The connection is opened with ``check_same_thread=False`` and uses
        ``sqlite3.Row`` as its row factory. A list of PRAGMA statements is then
        applied one by one; a statement that raises ``sqlite3.DatabaseError``
        is logged as a warning and skipped, and the connection is still
        returned.

        Returns:
            The configured ``sqlite3.Connection``.
        """
-        conn = sqlite3.connect(self.database, check_same_thread=False)
+        conn = sqlite3.connect(
            self.database,
            check_same_thread=False,
            # NOTE(review): per the sqlite3 module docs, ``timeout`` is how long
            # to wait on a locked database, which overlaps with the
            # busy_timeout PRAGMA issued below -- confirm both are wanted.
            timeout=DB_CONNECT_TIMEOUT_SECONDS,
        )
        conn.row_factory = sqlite3.Row
-        # Enable foreign-key enforcement so ON DELETE CASCADE and similar constraints take effect
+        pragma_statements = [
-        conn.execute("PRAGMA foreign_keys=ON")
+            "PRAGMA foreign_keys=ON",
-        # Use WAL mode for better concurrency
+            "PRAGMA journal_mode=WAL",
-        conn.execute("PRAGMA journal_mode=WAL")
+            "PRAGMA synchronous=NORMAL",
-        # Under WAL, use synchronous=NORMAL to balance performance and reliability
+            f"PRAGMA busy_timeout={DB_BUSY_TIMEOUT_MS}",
-        conn.execute("PRAGMA synchronous=NORMAL")
+            "PRAGMA temp_store=MEMORY",
-        # Set a reasonable timeout
+            f"PRAGMA cache_size={-DB_CACHE_SIZE_KB}",
-        conn.execute("PRAGMA busy_timeout=5000")
+            f"PRAGMA wal_autocheckpoint={DB_WAL_AUTOCHECKPOINT_PAGES}",
        ]
        # mmap is optional: a configured size of 0 leaves the PRAGMA out.
        # The configured megabytes are converted to bytes here.
        if DB_MMAP_SIZE_MB > 0:
            pragma_statements.append(f"PRAGMA mmap_size={DB_MMAP_SIZE_MB * 1024 * 1024}")
        # Best effort: one failing PRAGMA must not prevent the others from
        # being applied or the connection from being returned.
        for statement in pragma_statements:
            try:
                conn.execute(statement)
            except sqlite3.DatabaseError as e:
                logger.warning(f"设置数据库参数失败 ({statement}): {e}")
        return conn
    def _close_connection(self, conn) -> None:
@@ -198,7 +227,20 @@ class PooledConnection:
    def commit(self):
        """Commit the current transaction, retrying on lock conflicts.

        ``self._conn.commit()`` is attempted up to ``DB_LOCK_RETRY_COUNT + 1``
        times. After a failed attempt that ``_is_lock_conflict_error``
        classifies as a lock conflict, a warning is logged and the call sleeps
        before trying again.

        Raises:
            sqlite3.OperationalError: Re-raised immediately when the error is
                not a lock conflict, or when the last attempt also fails.
        """
-        self._conn.commit()
+        for attempt in range(DB_LOCK_RETRY_COUNT + 1):
            try:
                self._conn.commit()
                return
            except sqlite3.OperationalError as e:
                # Give up on non-lock errors and once the retries are used up.
                if (not _is_lock_conflict_error(e)) or attempt >= DB_LOCK_RETRY_COUNT:
                    raise
                # Exponential backoff: the base delay doubles with each attempt
                # (milliseconds converted to seconds for time.sleep).
                sleep_seconds = (DB_LOCK_RETRY_BASE_MS * (2**attempt)) / 1000.0
                logger.warning(
                    f"数据库提交遇到锁冲突，{sleep_seconds:.3f}s 后重试 "
                    f"({attempt + 1}/{DB_LOCK_RETRY_COUNT})"
                )
                time.sleep(sleep_seconds)
    def rollback(self):
        """回滚事务"""
--- a/services/maintenance.py
+++ b/services/maintenance.py
@@ -29,6 +29,12 @@ config = get_config()
 # Expiry settings (seconds), read from config with a fallback default.
 USER_ACCOUNTS_EXPIRE_SECONDS = int(getattr(config, "USER_ACCOUNTS_EXPIRE_SECONDS", 3600))
 BATCH_TASK_EXPIRE_SECONDS = int(getattr(config, "BATCH_TASK_EXPIRE_SECONDS", 21600))
 PENDING_RANDOM_EXPIRE_SECONDS = int(getattr(config, "PENDING_RANDOM_EXPIRE_SECONDS", 7200))
 # Database maintenance intervals (seconds). The optimize interval is floored
 # at 300, so it is always positive. The ANALYZE and WAL checkpoint intervals
 # are floored at 0, and the scheduler only starts those jobs when the value
 # is greater than 0.
 DB_PRAGMA_OPTIMIZE_INTERVAL_SECONDS = max(300, int(getattr(config, "DB_PRAGMA_OPTIMIZE_INTERVAL_SECONDS", 21600)))
 DB_ANALYZE_INTERVAL_SECONDS = max(0, int(getattr(config, "DB_ANALYZE_INTERVAL_SECONDS", 86400)))
 DB_WAL_CHECKPOINT_INTERVAL_SECONDS = max(0, int(getattr(config, "DB_WAL_CHECKPOINT_INTERVAL_SECONDS", 43200)))
 # The mode is normalised to upper case and restricted to a fixed set of
 # names because it is interpolated directly into the PRAGMA wal_checkpoint
 # statement text; anything else (including an empty value) becomes PASSIVE.
 DB_WAL_CHECKPOINT_MODE = str(getattr(config, "DB_WAL_CHECKPOINT_MODE", "PASSIVE") or "PASSIVE").upper().strip()
 if DB_WAL_CHECKPOINT_MODE not in {"PASSIVE", "FULL", "RESTART", "TRUNCATE"}:
    DB_WAL_CHECKPOINT_MODE = "PASSIVE"
 # 金山文档离线通知状态：每次掉线只通知一次，恢复在线后重置
 _kdocs_offline_notified: bool = False
@@ -275,6 +281,80 @@ def start_cleanup_scheduler() -> None:
    logger.info("内存清理调度器已启动")
 def _execute_db_statement(statement: str, *, commit: bool = False, fetchone: bool = False):
     """Execute a single statement on a connection obtained from the pool.

     Args:
         statement: SQL/PRAGMA text handed unchanged to ``execute``.
         commit: When true, ``commit()`` is called on the connection after the
             statement (and after any row has been fetched).
         fetchone: When true, one row is read with ``fetchone()`` right after
             executing; when false no row is read.

     Returns:
         The fetched row if ``fetchone`` is true, otherwise ``None``.
     """
     # db_pool is imported at call time rather than at module import.
     import db_pool

     with db_pool.get_db() as pooled:
         pooled.execute(statement)
         result = None
         if fetchone:
             # NOTE(review): assumes the object yielded by get_db() exposes
             # fetchone() itself -- confirm against db_pool.
             result = pooled.fetchone()
         if commit:
             pooled.commit()
         return result
 def optimize_database_runtime() -> None:
     """Issue ``PRAGMA optimize`` and log the outcome at DEBUG level.

     If the statement yields a truthy row, its values are included in the
     log line as a tuple; otherwise only the completion message is logged.
     """
     result_row = _execute_db_statement("PRAGMA optimize", fetchone=True)
     if not result_row:
         logger.debug("[DB维护] PRAGMA optimize 已执行")
         return
     logger.debug(f"[DB维护] PRAGMA optimize 已执行: {tuple(result_row)}")
 def analyze_database_stats() -> None:
     """Run ``ANALYZE`` with a commit, then log completion at INFO level."""
     _execute_db_statement(statement="ANALYZE", commit=True, fetchone=False)
     logger.info("[DB维护] ANALYZE 已完成")
 def checkpoint_database_wal() -> None:
     """Run ``PRAGMA wal_checkpoint`` in the configured mode and log the result.

     The mode comes from ``DB_WAL_CHECKPOINT_MODE``. If the statement yields a
     truthy row, its values are logged as a tuple at DEBUG level; otherwise a
     plain completion message is logged.
     """
     checkpoint_sql = f"PRAGMA wal_checkpoint({DB_WAL_CHECKPOINT_MODE})"
     outcome = _execute_db_statement(checkpoint_sql, fetchone=True)
     if not outcome:
         logger.debug(f"[DB维护] WAL checkpoint({DB_WAL_CHECKPOINT_MODE}) 已执行")
         return
     logger.debug(f"[DB维护] WAL checkpoint({DB_WAL_CHECKPOINT_MODE}) 结果: {tuple(outcome)}")
 def start_database_maintenance_scheduler() -> None:
     """Register the periodic database maintenance jobs and log the schedule.

     Up to three jobs are handed to ``_start_daemon_loop``, in this order:
     PRAGMA optimize (always), ANALYZE (only if its interval is > 0) and the
     WAL checkpoint (only if its interval is > 0). Each job has its own
     startup delay (180 / 300 / 420). A summary of the configured intervals
     is logged at INFO level afterwards, whether or not a job was skipped.
     """
     # (name, startup delay, interval, job, error tag, enabled)
     schedule = (
         (
             "db-optimize",
             180,
             DB_PRAGMA_OPTIMIZE_INTERVAL_SECONDS,
             optimize_database_runtime,
             "[DB维护] PRAGMA optimize 执行失败",
             True,
         ),
         (
             "db-analyze",
             300,
             DB_ANALYZE_INTERVAL_SECONDS,
             analyze_database_stats,
             "[DB维护] ANALYZE 执行失败",
             DB_ANALYZE_INTERVAL_SECONDS > 0,
         ),
         (
             "db-wal-checkpoint",
             420,
             DB_WAL_CHECKPOINT_INTERVAL_SECONDS,
             checkpoint_database_wal,
             "[DB维护] WAL checkpoint 执行失败",
             DB_WAL_CHECKPOINT_INTERVAL_SECONDS > 0,
         ),
     )
     for loop_name, delay, interval, job_fn, failure_tag, enabled in schedule:
         if not enabled:
             continue
         _start_daemon_loop(
             loop_name,
             startup_delay=delay,
             interval_seconds=interval,
             job=job_fn,
             error_tag=failure_tag,
         )

     summary = (
         "[DB维护] 调度器已启动: "
         f"optimize={DB_PRAGMA_OPTIMIZE_INTERVAL_SECONDS}s, "
         f"analyze={DB_ANALYZE_INTERVAL_SECONDS}s, "
         f"checkpoint={DB_WAL_CHECKPOINT_INTERVAL_SECONDS}s({DB_WAL_CHECKPOINT_MODE})"
     )
     logger.info(summary)
 def start_kdocs_monitor() -> None:
    """启动金山文档状态监控"""
    _start_daemon_loop(