🎉 项目优化与Bug修复完整版

 主要优化成果:
- 修复Unicode字符编码问题(Windows跨平台兼容性)
- 安装wkhtmltoimage,截图功能完全修复
- 智能延迟优化(api_browser.py)
- 线程池资源泄漏修复(tasks.py)
- HTML解析缓存机制
- 二分搜索算法优化(kdocs_uploader.py)
- 自适应资源配置(browser_pool_worker.py)

🐛 Bug修复:
- 解决截图失败问题
- 修复管理员密码设置
- 解决应用启动编码错误

📚 新增文档:
- BUG_REPORT.md - 完整bug分析报告
- PERFORMANCE_ANALYSIS_REPORT.md - 性能优化分析
- LINUX_DEPLOYMENT_ANALYSIS.md - Linux部署指南
- SCREENSHOT_FIX_SUCCESS.md - 截图功能修复记录
- INSTALL_WKHTMLTOIMAGE.md - 安装指南
- OPTIMIZATION_FIXES_SUMMARY.md - 优化总结

🚀 功能验证:
- Flask应用正常运行(51233端口)
- 数据库、截图线程池、API预热正常
- 管理员登录:admin/admin123
- 健康检查API:http://127.0.0.1:51233/health

💡 技术改进:
- 智能延迟算法(自适应调整)
- LRU缓存策略
- 线程池资源管理优化
- 二分搜索算法(O(log n) vs O(n))
- 自适应资源管理

🎯 项目现在稳定运行,可部署到Linux环境
This commit is contained in:
zsglpt Optimizer
2026-01-16 17:39:55 +08:00
parent 722dccdc78
commit 7e9a772104
47 changed files with 9382 additions and 749 deletions

View File

@@ -1,20 +1,98 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""截图线程池管理 - 工作线程池模式(并发执行截图任务)"""
import os
import threading
import queue
import time
from typing import Callable, Optional, Dict, Any
import os
import threading
import queue
import time
from typing import Callable, Optional, Dict, Any
# 安全修复: 将魔法数字提取为可配置常量
BROWSER_IDLE_TIMEOUT = int(os.environ.get('BROWSER_IDLE_TIMEOUT', '300')) # 空闲超时(秒)默认5分钟
TASK_QUEUE_TIMEOUT = int(os.environ.get('TASK_QUEUE_TIMEOUT', '10')) # 队列获取超时(秒)
TASK_QUEUE_MAXSIZE = int(os.environ.get('BROWSER_TASK_QUEUE_MAXSIZE', '200')) # 队列最大长度(0表示无限制)
BROWSER_MAX_USE_COUNT = int(os.environ.get('BROWSER_MAX_USE_COUNT', '0')) # 每个执行环境最大复用次数(0表示不限制)
BROWSER_IDLE_TIMEOUT = int(os.environ.get("BROWSER_IDLE_TIMEOUT", "300")) # 空闲超时(秒)默认5分钟
TASK_QUEUE_TIMEOUT = int(os.environ.get("TASK_QUEUE_TIMEOUT", "10")) # 队列获取超时(秒)
TASK_QUEUE_MAXSIZE = int(os.environ.get("BROWSER_TASK_QUEUE_MAXSIZE", "200")) # 队列最大长度(0表示无限制)
BROWSER_MAX_USE_COUNT = int(os.environ.get("BROWSER_MAX_USE_COUNT", "0")) # 每个执行环境最大复用次数(0表示不限制)
# 新增:自适应资源配置
ADAPTIVE_CONFIG = os.environ.get("BROWSER_ADAPTIVE_CONFIG", "1").strip().lower() in ("1", "true", "yes", "on")
LOAD_HISTORY_SIZE = 50 # 负载历史记录大小
class AdaptiveResourceManager:
"""自适应资源管理器"""
def __init__(self):
self._load_history = []
self._current_load = 0
self._last_adjustment = 0
self._adjustment_cooldown = 30 # 调整冷却时间30秒
def record_task_interval(self, interval: float):
"""记录任务间隔,更新负载历史"""
if len(self._load_history) >= LOAD_HISTORY_SIZE:
self._load_history.pop(0)
self._load_history.append(interval)
# 计算当前负载
if len(self._load_history) >= 2:
recent_intervals = self._load_history[-10:] # 最近10个任务
avg_interval = sum(recent_intervals) / len(recent_intervals)
# 负载越高,间隔越短
self._current_load = 1.0 / max(avg_interval, 0.1)
def should_adjust_timeout(self) -> bool:
"""判断是否应该调整超时配置"""
if not ADAPTIVE_CONFIG:
return False
current_time = time.time()
if current_time - self._last_adjustment < self._adjustment_cooldown:
return False
return len(self._load_history) >= 10 # 至少需要10个数据点
def calculate_optimal_idle_timeout(self) -> int:
"""基于历史负载计算最优空闲超时"""
if not self._load_history:
return BROWSER_IDLE_TIMEOUT
# 计算最近任务间隔的平均值
recent_intervals = self._load_history[-20:] # 最近20个任务
if len(recent_intervals) < 2:
return BROWSER_IDLE_TIMEOUT
avg_interval = sum(recent_intervals) / len(recent_intervals)
# 根据负载动态调整超时
# 高负载时缩短超时,低负载时延长超时
if self._current_load > 2.0: # 高负载
optimal_timeout = min(avg_interval * 1.5, 600) # 最多10分钟
elif self._current_load < 0.5: # 低负载
optimal_timeout = min(avg_interval * 3.0, 1800) # 最多30分钟
else: # 正常负载
optimal_timeout = min(avg_interval * 2.0, 900) # 最多15分钟
return max(int(optimal_timeout), 60) # 最少1分钟
def get_optimal_queue_timeout(self) -> int:
"""获取最优队列超时"""
if not self._load_history:
return TASK_QUEUE_TIMEOUT
# 根据任务频率调整队列超时
if self._current_load > 2.0: # 高负载时减少等待
return max(TASK_QUEUE_TIMEOUT // 2, 3)
elif self._current_load < 0.5: # 低负载时可以增加等待
return min(TASK_QUEUE_TIMEOUT * 2, 30)
else:
return TASK_QUEUE_TIMEOUT
def record_adjustment(self):
"""记录一次调整操作"""
self._last_adjustment = time.time()
class BrowserWorker(threading.Thread):
"""截图工作线程 - 每个worker维护自己的执行环境"""
@@ -36,21 +114,28 @@ class BrowserWorker(threading.Thread):
self.failed_tasks = 0
self.pre_warm = pre_warm
self.last_activity_ts = 0.0
def log(self, message: str):
"""日志输出"""
if self.log_callback:
self.log_callback(f"[Worker-{self.worker_id}] {message}")
else:
self.task_start_time = 0.0
# 初始化自适应资源管理器
if ADAPTIVE_CONFIG:
self._adaptive_mgr = AdaptiveResourceManager()
else:
self._adaptive_mgr = None
def log(self, message: str):
"""日志输出"""
if self.log_callback:
self.log_callback(f"[Worker-{self.worker_id}] {message}")
else:
print(f"[截图池][Worker-{self.worker_id}] {message}")
def _create_browser(self):
"""创建截图执行环境(逻辑占位,无需真实浏览器)"""
created_at = time.time()
self.browser_instance = {
'created_at': created_at,
'use_count': 0,
'worker_id': self.worker_id,
"created_at": created_at,
"use_count": 0,
"worker_id": self.worker_id,
}
self.last_activity_ts = created_at
self.log("截图执行环境就绪")
@@ -73,7 +158,7 @@ class BrowserWorker(threading.Thread):
self.log("执行环境不可用,尝试重新创建...")
self._close_browser()
return self._create_browser()
def run(self):
"""工作线程主循环 - 按需启动执行环境模式"""
if self.pre_warm:
@@ -94,19 +179,33 @@ class BrowserWorker(threading.Thread):
# 从队列获取任务(带超时,以便能响应停止信号和空闲检查)
self.idle = True
# 使用自适应队列超时
queue_timeout = (
self._adaptive_mgr.get_optimal_queue_timeout() if self._adaptive_mgr else TASK_QUEUE_TIMEOUT
)
try:
task = self.task_queue.get(timeout=TASK_QUEUE_TIMEOUT)
task = self.task_queue.get(timeout=queue_timeout)
except queue.Empty:
# 检查是否需要释放空闲的执行环境
if self.browser_instance and self.last_activity_ts > 0:
idle_time = time.time() - self.last_activity_ts
if idle_time > BROWSER_IDLE_TIMEOUT:
self.log(f"空闲{int(idle_time)}秒,释放执行环境")
# 使用自适应空闲超时
optimal_timeout = (
self._adaptive_mgr.calculate_optimal_idle_timeout()
if self._adaptive_mgr
else BROWSER_IDLE_TIMEOUT
)
if idle_time > optimal_timeout:
self.log(f"空闲{int(idle_time)}秒(优化超时:{optimal_timeout}秒),释放执行环境")
self._close_browser()
continue
self.idle = False
self.idle = False
if task is None: # None作为停止信号
self.log("收到停止信号")
break
@@ -146,21 +245,40 @@ class BrowserWorker(threading.Thread):
continue
# 执行任务
task_func = task.get('func')
task_args = task.get('args', ())
task_kwargs = task.get('kwargs', {})
callback = task.get('callback')
self.total_tasks += 1
self.browser_instance['use_count'] += 1
task_func = task.get("func")
task_args = task.get("args", ())
task_kwargs = task.get("kwargs", {})
callback = task.get("callback")
self.total_tasks += 1
# 确保browser_instance存在后再访问
if self.browser_instance is None:
self.log("执行环境不可用,任务失败")
if callable(callback):
callback(None, "执行环境不可用")
self.failed_tasks += 1
continue
self.browser_instance["use_count"] += 1
self.log(f"开始执行任务(第{self.browser_instance['use_count']}次执行)")
# 记录任务开始时间
task_start_time = time.time()
try:
# 将执行环境实例传递给任务函数
# 将执行环境实例传递给任务函数
result = task_func(self.browser_instance, *task_args, **task_kwargs)
callback(result, None)
self.log(f"任务执行成功")
# 记录任务完成并更新负载历史
task_end_time = time.time()
task_interval = task_end_time - task_start_time
if self._adaptive_mgr:
self._adaptive_mgr.record_task_interval(task_interval)
self.last_activity_ts = time.time()
except Exception as e:
@@ -176,23 +294,23 @@ class BrowserWorker(threading.Thread):
# 定期重启执行环境,释放可能累积的资源
if self.browser_instance and BROWSER_MAX_USE_COUNT > 0:
if self.browser_instance.get('use_count', 0) >= BROWSER_MAX_USE_COUNT:
if self.browser_instance.get("use_count", 0) >= BROWSER_MAX_USE_COUNT:
self.log(f"执行环境已复用{self.browser_instance['use_count']}次,重启释放资源")
self._close_browser()
except Exception as e:
self.log(f"Worker出错: {e}")
time.sleep(1)
# 清理资源
self._close_browser()
self.log(f"Worker停止总任务:{self.total_tasks}, 失败:{self.failed_tasks}")
def stop(self):
"""停止worker"""
self.running = False
# 清理资源
self._close_browser()
self.log(f"Worker停止总任务:{self.total_tasks}, 失败:{self.failed_tasks}")
def stop(self):
"""停止worker"""
self.running = False
class BrowserWorkerPool:
"""截图工作线程池"""
@@ -204,14 +322,14 @@ class BrowserWorkerPool:
self.workers = []
self.initialized = False
self.lock = threading.Lock()
def log(self, message: str):
"""日志输出"""
def log(self, message: str):
"""日志输出"""
if self.log_callback:
self.log_callback(message)
else:
print(f"[截图池] {message}")
def initialize(self):
"""初始化工作线程池按需模式默认预热1个执行环境"""
with self.lock:
@@ -231,7 +349,7 @@ class BrowserWorkerPool:
self.workers.append(worker)
self.initialized = True
self.log(f" 截图线程池初始化完成({self.pool_size}个worker就绪执行环境将在有任务时按需启动")
self.log(f"[OK] 截图线程池初始化完成({self.pool_size}个worker就绪执行环境将在有任务时按需启动")
# 初始化完成后默认预热1个执行环境降低容器重启后前几批任务的冷启动开销
self.warmup(1)
@@ -263,40 +381,40 @@ class BrowserWorkerPool:
time.sleep(0.1)
warmed = sum(1 for w in target_workers if w.browser_instance)
self.log(f" 截图线程池预热完成({warmed}个执行环境就绪)")
self.log(f"[OK] 截图线程池预热完成({warmed}个执行环境就绪)")
return warmed
def submit_task(self, task_func: Callable, callback: Callable, *args, **kwargs) -> bool:
"""
提交任务到队列
Args:
task_func: 任务函数,签名为 func(browser_instance, *args, **kwargs)
callback: 回调函数,签名为 callback(result, error)
*args, **kwargs: 传递给task_func的参数
Returns:
是否成功提交
"""
if not self.initialized:
self.log("警告:线程池未初始化")
return False
"""
提交任务到队列
Args:
task_func: 任务函数,签名为 func(browser_instance, *args, **kwargs)
callback: 回调函数,签名为 callback(result, error)
*args, **kwargs: 传递给task_func的参数
Returns:
是否成功提交
"""
if not self.initialized:
self.log("警告:线程池未初始化")
return False
task = {
'func': task_func,
'args': args,
'kwargs': kwargs,
'callback': callback,
'retry_count': 0,
"func": task_func,
"args": args,
"kwargs": kwargs,
"callback": callback,
"retry_count": 0,
}
try:
self.task_queue.put(task, timeout=1)
return True
except queue.Full:
self.log(f"警告任务队列已满maxsize={self.task_queue.maxsize}),拒绝提交任务")
return False
def get_stats(self) -> Dict[str, Any]:
"""获取线程池统计信息"""
workers = list(self.workers or [])
@@ -328,64 +446,64 @@ class BrowserWorkerPool:
)
return {
'pool_size': self.pool_size,
'idle_workers': idle_count,
'busy_workers': max(0, len(workers) - idle_count),
'queue_size': self.task_queue.qsize(),
'total_tasks': total_tasks,
'failed_tasks': failed_tasks,
'success_rate': f"{(total_tasks - failed_tasks) / total_tasks * 100:.1f}%" if total_tasks > 0 else "N/A",
'workers': worker_details,
'timestamp': time.time(),
"pool_size": self.pool_size,
"idle_workers": idle_count,
"busy_workers": max(0, len(workers) - idle_count),
"queue_size": self.task_queue.qsize(),
"total_tasks": total_tasks,
"failed_tasks": failed_tasks,
"success_rate": f"{(total_tasks - failed_tasks) / total_tasks * 100:.1f}%" if total_tasks > 0 else "N/A",
"workers": worker_details,
"timestamp": time.time(),
}
def wait_for_completion(self, timeout: Optional[float] = None):
"""等待所有任务完成"""
start_time = time.time()
while not self.task_queue.empty():
if timeout and (time.time() - start_time) > timeout:
self.log("等待超时")
return False
time.sleep(0.5)
# 再等待一下确保正在执行的任务完成
time.sleep(2)
return True
def shutdown(self):
"""关闭线程池"""
self.log("正在关闭工作线程池...")
# 发送停止信号
for _ in self.workers:
self.task_queue.put(None)
# 等待所有worker停止
for worker in self.workers:
worker.join(timeout=10)
self.workers.clear()
self.initialized = False
self.log(" 工作线程池已关闭")
# 全局实例
_global_pool: Optional[BrowserWorkerPool] = None
_pool_lock = threading.Lock()
def wait_for_completion(self, timeout: Optional[float] = None):
"""等待所有任务完成"""
start_time = time.time()
while not self.task_queue.empty():
if timeout and (time.time() - start_time) > timeout:
self.log("等待超时")
return False
time.sleep(0.5)
# 再等待一下确保正在执行的任务完成
time.sleep(2)
return True
def shutdown(self):
"""关闭线程池"""
self.log("正在关闭工作线程池...")
# 发送停止信号
for _ in self.workers:
self.task_queue.put(None)
# 等待所有worker停止
for worker in self.workers:
worker.join(timeout=10)
self.workers.clear()
self.initialized = False
self.log("[OK] 工作线程池已关闭")
# 全局实例
_global_pool: Optional[BrowserWorkerPool] = None
_pool_lock = threading.Lock()
def get_browser_worker_pool(pool_size: int = 3, log_callback: Optional[Callable] = None) -> BrowserWorkerPool:
"""获取全局截图工作线程池(单例)"""
global _global_pool
with _pool_lock:
if _global_pool is None:
_global_pool = BrowserWorkerPool(pool_size=pool_size, log_callback=log_callback)
_global_pool.initialize()
return _global_pool
global _global_pool
with _pool_lock:
if _global_pool is None:
_global_pool = BrowserWorkerPool(pool_size=pool_size, log_callback=log_callback)
_global_pool.initialize()
return _global_pool
def init_browser_worker_pool(pool_size: int = 3, log_callback: Optional[Callable] = None):
"""初始化全局截图工作线程池"""
get_browser_worker_pool(pool_size=pool_size, log_callback=log_callback)
@@ -428,43 +546,43 @@ def resize_browser_worker_pool(pool_size: int, log_callback: Optional[Callable]
def shutdown_browser_worker_pool():
"""关闭全局截图工作线程池"""
global _global_pool
with _pool_lock:
if _global_pool:
_global_pool.shutdown()
_global_pool = None
if __name__ == '__main__':
with _pool_lock:
if _global_pool:
_global_pool.shutdown()
_global_pool = None
if __name__ == "__main__":
# 测试代码
print("测试截图工作线程池...")
def test_task(browser_instance, url: str, task_id: int):
"""测试任务访问URL"""
print(f"[Task-{task_id}] 开始访问: {url}")
time.sleep(2) # 模拟截图耗时
return {'task_id': task_id, 'url': url, 'status': 'success'}
def test_callback(result, error):
"""测试回调"""
if error:
print(f"任务失败: {error}")
else:
print(f"任务成功: {result}")
def test_task(browser_instance, url: str, task_id: int):
"""测试任务访问URL"""
print(f"[Task-{task_id}] 开始访问: {url}")
time.sleep(2) # 模拟截图耗时
return {"task_id": task_id, "url": url, "status": "success"}
def test_callback(result, error):
"""测试回调"""
if error:
print(f"任务失败: {error}")
else:
print(f"任务成功: {result}")
# 创建线程池2个worker
pool = BrowserWorkerPool(pool_size=2)
pool.initialize()
# 提交4个任务
for i in range(4):
pool.submit_task(test_task, test_callback, f"https://example.com/{i}", i + 1)
print("\n任务已提交,等待完成...")
pool.wait_for_completion()
print("\n统计信息:", pool.get_stats())
# 关闭线程池
pool.shutdown()
print("\n测试完成!")
pool.initialize()
# 提交4个任务
for i in range(4):
pool.submit_task(test_task, test_callback, f"https://example.com/{i}", i + 1)
print("\n任务已提交,等待完成...")
pool.wait_for_completion()
print("\n统计信息:", pool.get_stats())
# 关闭线程池
pool.shutdown()
print("\n测试完成!")