From 41ead4bead3307fc8b089f2ee42bc284338a53cd Mon Sep 17 00:00:00 2001 From: yuyx <237899745@qq.com> Date: Wed, 31 Dec 2025 16:50:35 +0800 Subject: [PATCH] replace screenshot pipeline and update admin --- Dockerfile | 10 +- README.md | 42 +- admin-frontend/src/api/security.js | 6 +- admin-frontend/src/pages/ReportPage.vue | 34 +- admin-frontend/src/pages/SecurityPage.vue | 39 + admin-frontend/src/pages/SystemPage.vue | 2 +- api_browser.py | 82 +- app.py | 14 +- browser_installer.py | 214 --- browser_pool_worker.py | 240 ++- docker-compose.yml | 3 - playwright_automation.py | 1585 ------------------ requirements.txt | 2 - routes/admin_api/core.py | 16 +- routes/admin_api/security.py | 16 +- routes/api_accounts.py | 12 - security/risk_scorer.py | 27 + services/browser_manager.py | 112 -- services/screenshots.py | 218 +-- services/tasks.py | 2 +- static/admin/assets/ReportPage-nXQwTJlk.js | 2 +- static/admin/assets/SecurityPage-Czxm2GJx.js | 6 +- static/admin/assets/SystemPage-BPHukDdR.js | 2 +- templates/admin_legacy.html | 2 +- tests/test_browser_pool_worker.py | 5 +- 25 files changed, 443 insertions(+), 2250 deletions(-) delete mode 100755 browser_installer.py delete mode 100755 playwright_automation.py delete mode 100644 services/browser_manager.py diff --git a/Dockerfile b/Dockerfile index ba3a6bb..c152104 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,14 +1,18 @@ # 使用国内镜像源加速 -FROM mcr.microsoft.com/playwright/python:v1.40.0-jammy +FROM python:3.10-slim-bullseye # 设置工作目录 WORKDIR /app # 设置环境变量 ENV PYTHONUNBUFFERED=1 -ENV PLAYWRIGHT_BROWSERS_PATH=/ms-playwright ENV TZ=Asia/Shanghai +# 安装 wkhtmltopdf(包含 wkhtmltoimage)与中文字体 +RUN apt-get update && \ + apt-get install -y --no-install-recommends wkhtmltopdf fonts-noto-cjk && \ + rm -rf /var/lib/apt/lists/* + # 配置 pip 使用国内镜像源 RUN pip config set global.index-url https://mirrors.aliyun.com/pypi/simple/ && pip config set install.trusted-host mirrors.aliyun.com @@ -22,10 +26,8 @@ RUN pip install --no-cache-dir -r requirements.txt COPY app.py . COPY database.py . COPY db_pool.py . -COPY playwright_automation.py . COPY api_browser.py . COPY browser_pool_worker.py . -COPY browser_installer.py . COPY password_utils.py . COPY crypto_utils.py . COPY task_checkpoint.py . diff --git a/README.md b/README.md index e6d0c17..e642ffd 100644 --- a/README.md +++ b/README.md @@ -6,10 +6,10 @@ ## 项目简介 -本项目是一个 **Docker 容器化应用**,使用 Flask + Playwright + SQLite 构建,提供: +本项目是一个 **Docker 容器化应用**,使用 Flask + Requests + wkhtmltopdf + SQLite 构建,提供: - 多用户注册登录系统 -- 浏览器自动化任务 +- 自动化任务(HTTP 模拟) - 定时任务调度 - 截图管理 - VIP用户管理 @@ -22,7 +22,8 @@ - **后端**: Python 3.8+, Flask - **数据库**: SQLite -- **自动化**: Playwright (Chromium) +- **自动化**: Requests + BeautifulSoup +- **截图**: wkhtmltopdf / wkhtmltoimage - **容器化**: Docker + Docker Compose - **前端**: HTML + JavaScript + Socket.IO @@ -39,10 +40,8 @@ zsglpt/ ├── database.py # 数据库稳定门面(对外 API) ├── db/ # DB 分域实现 + schema/migrations ├── db_pool.py # 数据库连接池 -├── playwright_automation.py # Playwright 自动化 ├── api_browser.py # Requests 自动化(主浏览流程) -├── browser_pool_worker.py # 截图 WorkerPool(浏览器复用) -├── browser_installer.py # 浏览器安装检查 +├── browser_pool_worker.py # 截图 WorkerPool ├── app_config.py # 配置管理 ├── app_logger.py # 日志系统 ├── app_security.py # 安全模块 @@ -122,8 +121,8 @@ cd /www/wwwroot/zsgpt2 ### 步骤4: 创建必要的目录 ```bash -mkdir -p data logs 截图 playwright -chmod 777 data logs 截图 playwright +mkdir -p data logs 截图 +chmod 777 data logs 截图 ``` ### 步骤5: 构建并启动Docker容器 @@ -447,19 +446,19 @@ docker-compose down docker-compose up -d ``` -### 5. 浏览器下载失败 +### 5. 截图工具未安装 -**问题**: Playwright浏览器下载失败 +**问题**: wkhtmltoimage 命令不存在 **解决方案**: ```bash # 进入容器手动安装 docker exec -it knowledge-automation-multiuser bash -playwright install chromium +apt-get update +apt-get install -y wkhtmltopdf -# 或使用国内镜像 -export PLAYWRIGHT_DOWNLOAD_HOST=https://npmmirror.com/mirrors/playwright/ -playwright install chromium +# 验证安装 +wkhtmltoimage --version ``` --- @@ -631,7 +630,12 @@ docker logs knowledge-automation-multiuser | grep "数据库" |--------|------|--------| | TZ | 时区 | Asia/Shanghai | | PYTHONUNBUFFERED | Python输出缓冲 | 1 | -| PLAYWRIGHT_BROWSERS_PATH | 浏览器路径 | /ms-playwright | +| WKHTMLTOIMAGE_PATH | wkhtmltoimage 可执行文件路径 | 自动探测 | +| WKHTMLTOIMAGE_JS_DELAY_MS | JS 等待时间(毫秒) | 3000 | +| WKHTMLTOIMAGE_WIDTH | 截图宽度 | 1920 | +| WKHTMLTOIMAGE_QUALITY | JPG截图质量 | 95 | +| WKHTMLTOIMAGE_TIMEOUT_SECONDS | 截图超时时间(秒) | 60 | +| WKHTMLTOIMAGE_USER_AGENT | 截图使用的 UA | Chrome 120 | --- @@ -641,13 +645,13 @@ docker logs knowledge-automation-multiuser | grep "数据库" - **项目名称**: 知识管理平台自动化工具 - **版本**: Docker 多用户版 -- **技术栈**: Python + Flask + Playwright + SQLite + Docker +- **技术栈**: Python + Flask + Requests + wkhtmltopdf + SQLite + Docker ### 常用文档链接 - [Docker 官方文档](https://docs.docker.com/) - [Flask 官方文档](https://flask.palletsprojects.com/) -- [Playwright 官方文档](https://playwright.dev/python/) +- [wkhtmltopdf 官方文档](https://wkhtmltopdf.org/) ### 故障排查 @@ -683,8 +687,8 @@ ssh root@your-ip # 3. 进入目录并创建必要目录 cd /www/wwwroot/zsgpt2 -mkdir -p data logs 截图 playwright -chmod 777 data logs 截图 playwright +mkdir -p data logs 截图 +chmod 777 data logs 截图 # 4. 启动容器 docker-compose up -d diff --git a/admin-frontend/src/api/security.js b/admin-frontend/src/api/security.js index 7d117f1..a7aed95 100644 --- a/admin-frontend/src/api/security.js +++ b/admin-frontend/src/api/security.js @@ -46,6 +46,11 @@ export async function getIpRisk(ip) { return data } +export async function clearIpRisk(ip) { + const { data } = await api.post('/admin/security/ip-risk/clear', { ip }) + return data +} + export async function getUserRisk(userId) { const safeUserId = encodeURIComponent(String(userId || '').trim()) const { data } = await api.get(`/admin/security/user-risk/${safeUserId}`) @@ -56,4 +61,3 @@ export async function cleanup() { const { data } = await api.post('/admin/security/cleanup', {}) return data } - diff --git a/admin-frontend/src/pages/ReportPage.vue b/admin-frontend/src/pages/ReportPage.vue index 9090f37..4ed91c6 100644 --- a/admin-frontend/src/pages/ReportPage.vue +++ b/admin-frontend/src/pages/ReportPage.vue @@ -25,6 +25,7 @@ const refreshStats = inject('refreshStats', null) const adminStats = inject('adminStats', null) const loading = ref(false) +const refreshing = ref(false) const lastUpdatedAt = ref('') const taskStats = ref(null) @@ -181,9 +182,13 @@ const runningCountsLabel = computed(() => { return `运行中 ${runningCount} / 排队 ${queuingCount} / 并发上限 ${maxGlobal || maxConcurrentGlobal.value || '-'}` }) -async function refreshAll() { - if (loading.value) return - loading.value = true +async function refreshAll(options = {}) { + const showLoading = options.showLoading ?? true + if (refreshing.value) return + refreshing.value = true + if (showLoading) { + loading.value = true + } try { const [ taskResult, @@ -217,15 +222,22 @@ async function refreshAll() { await refreshStats?.() recordUpdatedAt() } finally { - loading.value = false + refreshing.value = false + if (showLoading) { + loading.value = false + } } } let refreshTimer = null +function manualRefresh() { + return refreshAll({ showLoading: true }) +} + onMounted(() => { - refreshAll() - refreshTimer = setInterval(refreshAll, 1000) + refreshAll({ showLoading: false }) + refreshTimer = setInterval(() => refreshAll({ showLoading: false }), 1000) }) onUnmounted(() => { @@ -252,7 +264,7 @@ onUnmounted(() => {
- 刷新 + 刷新
@@ -593,9 +605,9 @@ onUnmounted(() => {
-
浏览器池
+
截图线程池
- 活跃(有浏览器){{ browserPoolActiveWorkers }} · 忙碌 {{ browserPoolBusyWorkers }} · 队列 {{ browserPoolQueueSize }} + 活跃(有执行环境){{ browserPoolActiveWorkers }} · 忙碌 {{ browserPoolBusyWorkers }} · 队列 {{ browserPoolQueueSize }}
@@ -609,7 +621,7 @@ onUnmounted(() => {
{{ browserPoolActiveWorkers }}
-
活跃(有浏览器)
+
活跃(有执行环境)
{{ browserPoolIdleWorkers }}
@@ -645,7 +657,7 @@ onUnmounted(() => { - +
diff --git a/admin-frontend/src/pages/SecurityPage.vue b/admin-frontend/src/pages/SecurityPage.vue index f1c0775..fb0c3b0 100644 --- a/admin-frontend/src/pages/SecurityPage.vue +++ b/admin-frontend/src/pages/SecurityPage.vue @@ -6,6 +6,7 @@ import { banIp, banUser, cleanup, + clearIpRisk, getBannedIps, getBannedUsers, getDashboard, @@ -381,6 +382,35 @@ async function unbanFromRisk() { } } +async function clearIpRiskScore() { + if (riskResultKind.value !== 'ip') return + const ipText = String(riskResult.value?.ip || '').trim() + if (!ipText) return + + try { + await ElMessageBox.confirm( + `确定清除 IP ${ipText} 的风险分吗?\n\n清除风险分不会删除威胁历史,也不会解除封禁。`, + '清除风险分', + { confirmButtonText: '清除', cancelButtonText: '取消', type: 'warning' }, + ) + } catch { + return + } + + if (riskLoading.value) return + riskLoading.value = true + try { + await clearIpRisk(ipText) + ElMessage.success('IP风险分已清零') + } catch { + // handled by interceptor + } finally { + riskLoading.value = false + } + + await queryIpRisk() +} + const cleanupLoading = ref(false) async function onCleanup() { @@ -613,6 +643,15 @@ onMounted(async () => {
封禁 解除封禁 + + 清除风险分 +
diff --git a/admin-frontend/src/pages/SystemPage.vue b/admin-frontend/src/pages/SystemPage.vue index 04943c9..fe45f40 100644 --- a/admin-frontend/src/pages/SystemPage.vue +++ b/admin-frontend/src/pages/SystemPage.vue @@ -261,7 +261,7 @@ onMounted(loadAll) -
同时进行截图的最大数量(每个浏览器约占用 200MB 内存)。
+
同时进行截图的最大数量(wkhtmltoimage 资源占用较低,可按需提高)。
diff --git a/api_browser.py b/api_browser.py index c059fff..e88cd34 100755 --- a/api_browser.py +++ b/api_browser.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- """ API 浏览器 - 用纯 HTTP 请求实现浏览功能 -比 Playwright 快 30-60 倍 +比传统浏览器自动化快 30-60 倍 """ import requests @@ -44,6 +44,27 @@ except Exception: _API_DIAGNOSTIC_SLOW_MS = max(0, _API_DIAGNOSTIC_SLOW_MS) _cookie_domain_fallback = urlsplit(BASE_URL).hostname or "postoa.aidunsoft.com" +_COOKIE_JAR_MAX_AGE_SECONDS = 24 * 60 * 60 + + +def get_cookie_jar_path(username: str) -> str: + """获取截图用的 cookies 文件路径(Netscape Cookie 格式)""" + import hashlib + + os.makedirs(COOKIES_DIR, exist_ok=True) + filename = hashlib.sha256(username.encode()).hexdigest()[:32] + ".cookies.txt" + return os.path.join(COOKIES_DIR, filename) + + +def is_cookie_jar_fresh(cookie_path: str, max_age_seconds: int = _COOKIE_JAR_MAX_AGE_SECONDS) -> bool: + """判断 cookies 文件是否存在且未过期""" + if not cookie_path or not os.path.exists(cookie_path): + return False + try: + file_age = time.time() - os.path.getmtime(cookie_path) + return file_age <= max(0, int(max_age_seconds or 0)) + except Exception: + return False _api_browser_instances: "weakref.WeakSet[APIBrowser]" = weakref.WeakSet() @@ -102,38 +123,37 @@ class APIBrowser: """记录日志""" if self.log_callback: self.log_callback(message) - def save_cookies_for_playwright(self, username: str): - """保存cookies供Playwright使用""" - import os - import json - import hashlib - - os.makedirs(COOKIES_DIR, exist_ok=True) - - # 安全修复:使用SHA256代替MD5作为文件名哈希 - filename = hashlib.sha256(username.encode()).hexdigest()[:32] + '.json' - cookies_path = os.path.join(COOKIES_DIR, filename) - + def save_cookies_for_screenshot(self, username: str): + """保存 cookies 供 wkhtmltoimage 使用(Netscape Cookie 格式)""" + cookies_path = get_cookie_jar_path(username) try: - # 获取requests session的cookies - cookies_list = [] + lines = [ + "# Netscape HTTP Cookie File", + "# This file was generated by zsglpt", + ] for cookie in self.session.cookies: - cookies_list.append({ - 'name': cookie.name, - 'value': cookie.value, - 'domain': cookie.domain or _cookie_domain_fallback, - 'path': cookie.path or '/', - }) - - # Playwright storage_state 格式 - storage_state = { - 'cookies': cookies_list, - 'origins': [] - } - - with open(cookies_path, 'w', encoding='utf-8') as f: - json.dump(storage_state, f) - + domain = cookie.domain or _cookie_domain_fallback + include_subdomains = "TRUE" if domain.startswith(".") else "FALSE" + path = cookie.path or "/" + secure = "TRUE" if getattr(cookie, "secure", False) else "FALSE" + expires = int(getattr(cookie, "expires", 0) or 0) + lines.append( + "\t".join( + [ + domain, + include_subdomains, + path, + secure, + str(expires), + cookie.name, + cookie.value, + ] + ) + ) + + with open(cookies_path, "w", encoding="utf-8") as f: + f.write("\n".join(lines) + "\n") + self.log(f"[API] Cookies已保存供截图使用") return True except Exception as e: diff --git a/app.py b/app.py index db1d816..7f124a9 100644 --- a/app.py +++ b/app.py @@ -33,7 +33,6 @@ from realtime.socketio_handlers import register_socketio_handlers from realtime.status_push import status_push_worker from routes import register_blueprints from security import init_security_middleware -from services.browser_manager import init_browser_manager from services.checkpoints import init_checkpoint_manager from services.maintenance import start_cleanup_scheduler from services.models import User @@ -199,7 +198,7 @@ def cleanup_on_exit(): except Exception: pass - logger.info("- 关闭浏览器线程池...") + logger.info("- 关闭截图线程池...") try: shutdown_browser_worker_pool() except Exception: @@ -278,15 +277,6 @@ if __name__ == "__main__": except Exception as e: logger.warning(f"警告: 加载并发配置失败,使用默认值: {e}") - logger.info("正在初始化浏览器管理器...") - try: - from services.browser_manager import init_browser_manager_async - - logger.info("启动浏览器环境初始化(后台进行,不阻塞服务启动)...") - init_browser_manager_async() - except Exception as e: - logger.warning(f"警告: 启动浏览器初始化失败: {e}") - logger.info("启动定时任务调度器...") threading.Thread(target=scheduled_task_worker, daemon=True, name="scheduled-task-worker").start() logger.info("✓ 定时任务调度器已启动") @@ -305,7 +295,7 @@ if __name__ == "__main__": except Exception: pool_size = 3 try: - logger.info(f"初始化截图线程池({pool_size}个worker,按需启动浏览器,空闲5分钟后自动关闭)...") + logger.info(f"初始化截图线程池({pool_size}个worker,按需启动执行环境,空闲5分钟后自动释放)...") init_browser_worker_pool(pool_size=pool_size) logger.info("✓ 截图线程池初始化完成") except Exception as e: diff --git a/browser_installer.py b/browser_installer.py deleted file mode 100755 index b940397..0000000 --- a/browser_installer.py +++ /dev/null @@ -1,214 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -浏览器自动下载安装模块 -检测本地是否有Playwright浏览器,如果没有则自动下载安装 -""" - -import os -import sys -import shutil -import subprocess -from pathlib import Path - -# 设置浏览器安装路径(支持Docker和本地环境) -# Docker环境: PLAYWRIGHT_BROWSERS_PATH环境变量已设置为 /ms-playwright -# 本地环境: 使用Playwright默认路径 -if 'PLAYWRIGHT_BROWSERS_PATH' in os.environ: - BROWSERS_PATH = os.environ['PLAYWRIGHT_BROWSERS_PATH'] -else: - # Windows: %USERPROFILE%\AppData\Local\ms-playwright - # Linux: ~/.cache/ms-playwright - if sys.platform == 'win32': - BROWSERS_PATH = str(Path.home() / "AppData" / "Local" / "ms-playwright") - else: - BROWSERS_PATH = str(Path.home() / ".cache" / "ms-playwright") - os.environ["PLAYWRIGHT_BROWSERS_PATH"] = BROWSERS_PATH - - -class BrowserInstaller: - """浏览器安装器""" - - def __init__(self, log_callback=None): - """ - 初始化安装器 - - Args: - log_callback: 日志回调函数 - """ - self.log_callback = log_callback - - def log(self, message): - """输出日志""" - if self.log_callback: - self.log_callback(message) - else: - try: - print(message) - except UnicodeEncodeError: - # 如果打印Unicode字符失败,替换特殊字符 - safe_message = message.replace('✓', '[OK]').replace('✗', '[X]') - print(safe_message) - - def check_playwright_installed(self): - """检查Playwright是否已安装""" - try: - import playwright - self.log("✓ Playwright已安装") - return True - except ImportError: - self.log("✗ Playwright未安装") - return False - - def check_chromium_installed(self): - """检查Chromium浏览器是否已安装""" - try: - from playwright.sync_api import sync_playwright - - # 尝试启动浏览器检查是否可用 - with sync_playwright() as p: - try: - # 使用超时快速检查 - browser = p.chromium.launch(headless=True, timeout=5000) - browser.close() - self.log("✓ Chromium浏览器已安装且可用") - return True - except Exception as e: - error_msg = str(e) - self.log(f"✗ Chromium浏览器不可用: {error_msg}") - - # 检查是否是路径不存在的错误 - if "Executable doesn't exist" in error_msg: - self.log("检测到浏览器文件缺失,需要重新安装") - - return False - except Exception as e: - self.log(f"✗ 检查浏览器时出错: {str(e)}") - return False - - def install_chromium(self): - """安装Chromium浏览器""" - try: - self.log("正在安装 Chromium 浏览器...") - - # 查找 playwright 可执行文件 - playwright_cli = None - possible_paths = [ - os.path.join(os.path.dirname(sys.executable), "Scripts", "playwright.exe"), - os.path.join(os.path.dirname(sys.executable), "playwright.exe"), - os.path.join(os.path.dirname(sys.executable), "Scripts", "playwright"), - os.path.join(os.path.dirname(sys.executable), "playwright"), - "playwright", # 系统PATH中 - ] - - for path in possible_paths: - if os.path.exists(path) or shutil.which(path): - playwright_cli = path - break - - # 如果找到了 playwright CLI,直接调用 - if playwright_cli: - self.log(f"使用 Playwright CLI: {playwright_cli}") - result = subprocess.run( - [playwright_cli, "install", "chromium"], - capture_output=True, - text=True, - timeout=300 - ) - else: - # 检测是否是 Nuitka 编译的程序 - is_nuitka = hasattr(sys, 'frozen') or '__compiled__' in globals() - - if is_nuitka: - self.log("检测到 Nuitka 编译环境") - self.log("✗ 无法找到 playwright CLI 工具") - self.log("请手动运行: playwright install chromium") - return False - else: - # 使用 python -m - result = subprocess.run( - [sys.executable, "-m", "playwright", "install", "chromium"], - capture_output=True, - text=True, - timeout=300 - ) - - if result.returncode == 0: - self.log("✓ Chromium浏览器安装成功") - return True - else: - self.log(f"✗ 浏览器安装失败: {result.stderr}") - return False - - except subprocess.TimeoutExpired: - self.log("✗ 浏览器安装超时") - return False - except Exception as e: - self.log(f"✗ 浏览器安装出错: {str(e)}") - return False - - def auto_install(self): - """ - 自动检测并安装所需环境 - - Returns: - 是否成功安装或已安装 - """ - self.log("=" * 60) - self.log("检查浏览器环境...") - self.log("=" * 60) - - # 1. 检查Playwright是否安装 - if not self.check_playwright_installed(): - self.log("✗ Playwright未安装,无法继续") - self.log("请确保程序包含 Playwright 库") - return False - - # 2. 检查Chromium浏览器是否安装 - if not self.check_chromium_installed(): - self.log("\n未检测到Chromium浏览器,开始自动安装...") - - # 安装浏览器 - if not self.install_chromium(): - self.log("✗ 浏览器安装失败") - self.log("\n您可以尝试以下方法:") - self.log("1. 手动执行: playwright install chromium") - self.log("2. 检查网络连接后重试") - self.log("3. 检查防火墙设置") - return False - - self.log("\n" + "=" * 60) - self.log("✓ 浏览器环境检查完成,一切就绪!") - self.log("=" * 60 + "\n") - - return True - - -def check_and_install_browser(log_callback=None): - """ - 便捷函数:检查并安装浏览器 - - Args: - log_callback: 日志回调函数 - - Returns: - 是否成功 - """ - installer = BrowserInstaller(log_callback) - return installer.auto_install() - - -# 测试代码 -if __name__ == "__main__": - print("浏览器自动安装工具") - print("=" * 60) - - installer = BrowserInstaller() - success = installer.auto_install() - - if success: - print("\n✓ 安装成功!您现在可以运行主程序了。") - else: - print("\n✗ 安装失败,请查看上方错误信息。") - - print("=" * 60) diff --git a/browser_pool_worker.py b/browser_pool_worker.py index 12e632d..d0dc0a5 100755 --- a/browser_pool_worker.py +++ b/browser_pool_worker.py @@ -1,42 +1,22 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -"""浏览器池管理 - 工作线程池模式(真正的浏览器复用)""" +"""截图线程池管理 - 工作线程池模式(并发执行截图任务)""" import os import threading import queue import time from typing import Callable, Optional, Dict, Any -import nest_asyncio - -_NEST_ASYNCIO_APPLIED = False -_NEST_ASYNCIO_LOCK = threading.Lock() - - -def _apply_nest_asyncio_once() -> None: - """按需应用 nest_asyncio,避免 import 时产生全局副作用。""" - global _NEST_ASYNCIO_APPLIED - - if _NEST_ASYNCIO_APPLIED: - return - with _NEST_ASYNCIO_LOCK: - if _NEST_ASYNCIO_APPLIED: - return - try: - nest_asyncio.apply() - except Exception: - pass - _NEST_ASYNCIO_APPLIED = True # 安全修复: 将魔法数字提取为可配置常量 BROWSER_IDLE_TIMEOUT = int(os.environ.get('BROWSER_IDLE_TIMEOUT', '300')) # 空闲超时(秒),默认5分钟 TASK_QUEUE_TIMEOUT = int(os.environ.get('TASK_QUEUE_TIMEOUT', '10')) # 队列获取超时(秒) TASK_QUEUE_MAXSIZE = int(os.environ.get('BROWSER_TASK_QUEUE_MAXSIZE', '200')) # 队列最大长度(0表示无限制) -BROWSER_MAX_USE_COUNT = int(os.environ.get('BROWSER_MAX_USE_COUNT', '0')) # 每个浏览器最大复用次数(0表示不限制) +BROWSER_MAX_USE_COUNT = int(os.environ.get('BROWSER_MAX_USE_COUNT', '0')) # 每个执行环境最大复用次数(0表示不限制) class BrowserWorker(threading.Thread): - """浏览器工作线程 - 每个worker维护自己的浏览器""" + """截图工作线程 - 每个worker维护自己的执行环境""" def __init__( self, @@ -62,82 +42,44 @@ class BrowserWorker(threading.Thread): if self.log_callback: self.log_callback(f"[Worker-{self.worker_id}] {message}") else: - print(f"[浏览器池][Worker-{self.worker_id}] {message}") + print(f"[截图池][Worker-{self.worker_id}] {message}") def _create_browser(self): - """创建浏览器实例""" - try: - from playwright.sync_api import sync_playwright + """创建截图执行环境(逻辑占位,无需真实浏览器)""" + created_at = time.time() + self.browser_instance = { + 'created_at': created_at, + 'use_count': 0, + 'worker_id': self.worker_id, + } + self.last_activity_ts = created_at + self.log("截图执行环境就绪") + return True - self.log("正在创建浏览器...") - playwright = sync_playwright().start() - browser = playwright.chromium.launch( - headless=True, - args=[ - '--no-sandbox', - '--disable-setuid-sandbox', - '--disable-dev-shm-usage', - '--disable-gpu', - ] - ) + def _close_browser(self): + """关闭截图执行环境""" + if self.browser_instance: + self.log(f"执行环境已释放(共处理{self.browser_instance.get('use_count', 0)}个任务)") + self.browser_instance = None - created_at = time.time() - self.browser_instance = { - 'playwright': playwright, - 'browser': browser, - 'created_at': created_at, - 'use_count': 0, - 'worker_id': self.worker_id - } - self.last_activity_ts = created_at - self.log(f"浏览器创建成功") + def _check_browser_health(self) -> bool: + """检查执行环境是否就绪""" + return bool(self.browser_instance) + + def _ensure_browser(self) -> bool: + """确保执行环境可用""" + if self._check_browser_health(): return True - - except Exception as e: - self.log(f"创建浏览器失败: {e}") - return False - - def _close_browser(self): - """关闭浏览器""" - if self.browser_instance: - try: - self.log("正在关闭浏览器...") - if self.browser_instance['browser']: - self.browser_instance['browser'].close() - if self.browser_instance['playwright']: - self.browser_instance['playwright'].stop() - self.log(f"浏览器已关闭(共处理{self.browser_instance['use_count']}个任务)") - except Exception as e: - self.log(f"关闭浏览器时出错: {e}") - finally: - self.browser_instance = None - - def _check_browser_health(self) -> bool: - """检查浏览器是否健康""" - if not self.browser_instance: - return False - - try: - return self.browser_instance['browser'].is_connected() - except: - return False - - def _ensure_browser(self) -> bool: - """确保浏览器可用(如果不可用则重新创建)""" - if self._check_browser_health(): - return True - - # 浏览器不可用,尝试重新创建 - self.log("浏览器不可用,尝试重新创建...") - self._close_browser() - return self._create_browser() + self.log("执行环境不可用,尝试重新创建...") + self._close_browser() + return self._create_browser() def run(self): - """工作线程主循环 - 按需启动浏览器模式""" + """工作线程主循环 - 按需启动执行环境模式""" if self.pre_warm: - self.log("Worker启动(预热模式,启动即创建浏览器)") + self.log("Worker启动(预热模式,启动即准备执行环境)") else: - self.log("Worker启动(按需模式,等待任务时不占用浏览器资源)") + self.log("Worker启动(按需模式,等待任务时不占用资源)") if self.pre_warm and not self.browser_instance: self._create_browser() @@ -155,11 +97,11 @@ class BrowserWorker(threading.Thread): try: task = self.task_queue.get(timeout=TASK_QUEUE_TIMEOUT) except queue.Empty: - # 检查是否需要关闭空闲的浏览器 + # 检查是否需要释放空闲的执行环境 if self.browser_instance and self.last_activity_ts > 0: idle_time = time.time() - self.last_activity_ts if idle_time > BROWSER_IDLE_TIMEOUT: - self.log(f"空闲{int(idle_time)}秒,关闭浏览器释放资源") + self.log(f"空闲{int(idle_time)}秒,释放执行环境") self._close_browser() continue @@ -169,14 +111,14 @@ class BrowserWorker(threading.Thread): self.log("收到停止信号") break - # 按需创建或确保浏览器可用 + # 按需创建或确保执行环境可用 browser_ready = False for attempt in range(2): if self._ensure_browser(): browser_ready = True break if attempt < 1: - self.log("浏览器创建失败,重试...") + self.log("执行环境创建失败,重试...") time.sleep(0.5) if not browser_ready: @@ -185,20 +127,20 @@ class BrowserWorker(threading.Thread): task["retry_count"] = retry_count + 1 try: self.task_queue.put(task, timeout=1) - self.log("浏览器不可用,任务重新入队") + self.log("执行环境不可用,任务重新入队") except queue.Full: self.log("任务队列已满,无法重新入队,任务失败") callback = task.get("callback") if callable(callback): - callback(None, "浏览器不可用") + callback(None, "执行环境不可用") self.total_tasks += 1 self.failed_tasks += 1 continue - self.log("浏览器不可用,任务失败") + self.log("执行环境不可用,任务失败") callback = task.get("callback") if isinstance(task, dict) else None if callable(callback): - callback(None, "浏览器不可用") + callback(None, "执行环境不可用") self.total_tasks += 1 self.failed_tasks += 1 continue @@ -212,10 +154,10 @@ class BrowserWorker(threading.Thread): self.total_tasks += 1 self.browser_instance['use_count'] += 1 - self.log(f"开始执行任务(第{self.browser_instance['use_count']}次使用浏览器)") + self.log(f"开始执行任务(第{self.browser_instance['use_count']}次执行)") try: - # 将浏览器实例传递给任务函数 + # 将执行环境实例传递给任务函数 result = task_func(self.browser_instance, *task_args, **task_kwargs) callback(result, None) self.log(f"任务执行成功") @@ -227,15 +169,15 @@ class BrowserWorker(threading.Thread): self.failed_tasks += 1 self.last_activity_ts = time.time() - # 任务失败后,检查浏览器健康 + # 任务失败后,检查执行环境健康 if not self._check_browser_health(): - self.log("任务失败导致浏览器异常,将在下次任务前重建") + self.log("任务失败导致执行环境异常,将在下次任务前重建") self._close_browser() - # 定期重启浏览器,释放Chromium可能累积的内存 + # 定期重启执行环境,释放可能累积的资源 if self.browser_instance and BROWSER_MAX_USE_COUNT > 0: if self.browser_instance.get('use_count', 0) >= BROWSER_MAX_USE_COUNT: - self.log(f"浏览器已复用{self.browser_instance['use_count']}次,重启释放资源") + self.log(f"执行环境已复用{self.browser_instance['use_count']}次,重启释放资源") self._close_browser() except Exception as e: @@ -252,7 +194,7 @@ class BrowserWorker(threading.Thread): class BrowserWorkerPool: - """浏览器工作线程池""" + """截图工作线程池""" def __init__(self, pool_size: int = 3, log_callback: Optional[Callable] = None): self.pool_size = pool_size @@ -265,20 +207,18 @@ class BrowserWorkerPool: def log(self, message: str): """日志输出""" - if self.log_callback: - self.log_callback(message) - else: - print(f"[浏览器池] {message}") + if self.log_callback: + self.log_callback(message) + else: + print(f"[截图池] {message}") def initialize(self): - """初始化工作线程池(按需模式,默认预热1个浏览器)""" + """初始化工作线程池(按需模式,默认预热1个执行环境)""" with self.lock: if self.initialized: return - _apply_nest_asyncio_once() - - self.log(f"正在初始化工作线程池({self.pool_size}个worker,按需启动浏览器)...") + self.log(f"正在初始化截图线程池({self.pool_size}个worker,按需启动执行环境)...") for i in range(self.pool_size): worker = BrowserWorker( @@ -291,13 +231,13 @@ class BrowserWorkerPool: self.workers.append(worker) self.initialized = True - self.log(f"✓ 工作线程池初始化完成({self.pool_size}个worker就绪,浏览器将在有任务时按需启动)") + self.log(f"✓ 截图线程池初始化完成({self.pool_size}个worker就绪,执行环境将在有任务时按需启动)") - # 初始化完成后,默认预热1个浏览器,降低容器重启后前几批任务的冷启动开销 + # 初始化完成后,默认预热1个执行环境,降低容器重启后前几批任务的冷启动开销 self.warmup(1) def warmup(self, count: int = 1) -> int: - """预热浏览器池 - 预创建指定数量的浏览器""" + """预热截图线程池 - 预创建指定数量的执行环境""" if count <= 0: return 0 @@ -308,7 +248,7 @@ class BrowserWorkerPool: with self.lock: target_workers = list(self.workers[: min(count, len(self.workers))]) - self.log(f"预热浏览器池(预创建{len(target_workers)}个浏览器)...") + self.log(f"预热截图线程池(预创建{len(target_workers)}个执行环境)...") for worker in target_workers: if not worker.browser_instance: @@ -323,7 +263,7 @@ class BrowserWorkerPool: time.sleep(0.1) warmed = sum(1 for w in target_workers if w.browser_instance) - self.log(f"✓ 浏览器池预热完成({warmed}个浏览器就绪)") + self.log(f"✓ 截图线程池预热完成({warmed}个执行环境就绪)") return warmed def submit_task(self, task_func: Callable, callback: Callable, *args, **kwargs) -> bool: @@ -434,8 +374,8 @@ _global_pool: Optional[BrowserWorkerPool] = None _pool_lock = threading.Lock() -def get_browser_worker_pool(pool_size: int = 3, log_callback: Optional[Callable] = None) -> BrowserWorkerPool: - """获取全局浏览器工作线程池(单例)""" +def get_browser_worker_pool(pool_size: int = 3, log_callback: Optional[Callable] = None) -> BrowserWorkerPool: + """获取全局截图工作线程池(单例)""" global _global_pool with _pool_lock: @@ -446,14 +386,48 @@ def get_browser_worker_pool(pool_size: int = 3, log_callback: Optional[Callable] return _global_pool -def init_browser_worker_pool(pool_size: int = 3, log_callback: Optional[Callable] = None): - """初始化全局浏览器工作线程池""" - get_browser_worker_pool(pool_size=pool_size, log_callback=log_callback) - - -def shutdown_browser_worker_pool(): - """关闭全局浏览器工作线程池""" - global _global_pool +def init_browser_worker_pool(pool_size: int = 3, log_callback: Optional[Callable] = None): + """初始化全局截图工作线程池""" + get_browser_worker_pool(pool_size=pool_size, log_callback=log_callback) + + +def _shutdown_pool_when_idle(pool: BrowserWorkerPool) -> None: + try: + pool.wait_for_completion(timeout=60) + except Exception: + pass + try: + pool.shutdown() + except Exception: + pass + + +def resize_browser_worker_pool(pool_size: int, log_callback: Optional[Callable] = None) -> bool: + """调整截图线程池并发(新任务走新池,旧池空闲后自动关闭)""" + global _global_pool + + try: + target_size = max(1, int(pool_size)) + except Exception: + target_size = 1 + + with _pool_lock: + old_pool = _global_pool + if old_pool and int(getattr(old_pool, "pool_size", 0) or 0) == target_size: + return False + effective_log_callback = log_callback or (getattr(old_pool, "log_callback", None) if old_pool else None) + _global_pool = BrowserWorkerPool(pool_size=target_size, log_callback=effective_log_callback) + _global_pool.initialize() + + if old_pool: + threading.Thread(target=_shutdown_pool_when_idle, args=(old_pool,), daemon=True).start() + + return True + + +def shutdown_browser_worker_pool(): + """关闭全局截图工作线程池""" + global _global_pool with _pool_lock: if _global_pool: @@ -461,9 +435,9 @@ def shutdown_browser_worker_pool(): _global_pool = None -if __name__ == '__main__': - # 测试代码 - print("测试浏览器工作线程池...") +if __name__ == '__main__': + # 测试代码 + print("测试截图工作线程池...") def test_task(browser_instance, url: str, task_id: int): """测试任务:访问URL""" @@ -478,8 +452,8 @@ if __name__ == '__main__': else: print(f"任务成功: {result}") - # 创建线程池(2个worker) - pool = BrowserWorkerPool(pool_size=2) + # 创建线程池(2个worker) + pool = BrowserWorkerPool(pool_size=2) pool.initialize() # 提交4个任务 diff --git a/docker-compose.yml b/docker-compose.yml index 8b8b550..4fd2379 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -10,7 +10,6 @@ services: - ./data:/app/data # 数据库持久化 - ./logs:/app/logs # 日志持久化 - ./截图:/app/截图 # 截图持久化 - - ./playwright:/ms-playwright # Playwright浏览器持久化(避免重复下载) - /etc/localtime:/etc/localtime:ro # 时区同步 - ./static:/app/static # 静态文件(实时更新) - ./templates:/app/templates # 模板文件(实时更新) @@ -23,8 +22,6 @@ services: environment: - TZ=Asia/Shanghai - PYTHONUNBUFFERED=1 - - PLAYWRIGHT_BROWSERS_PATH=/ms-playwright - - PLAYWRIGHT_DOWNLOAD_HOST=https://npmmirror.com/mirrors/playwright # Flask 配置 - FLASK_ENV=production - FLASK_DEBUG=false diff --git a/playwright_automation.py b/playwright_automation.py deleted file mode 100755 index 7c2d9d5..0000000 --- a/playwright_automation.py +++ /dev/null @@ -1,1585 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -""" -Playwright版本 - 知识管理系统自动化核心 -使用浏览器上下文(Context)实现高性能并发 -""" - -import os -from pathlib import Path -from playwright.sync_api import sync_playwright, Browser, BrowserContext, Page, Playwright -import time -import json -import threading -import atexit -import weakref -from typing import Optional, Callable -from dataclasses import dataclass -from urllib.parse import urlsplit, urlunsplit -from app_config import get_config - -# 设置浏览器安装路径(优先使用环境变量,否则使用默认路径) -if 'PLAYWRIGHT_BROWSERS_PATH' not in os.environ: - # 本地开发环境,使用Windows默认路径 - BROWSERS_PATH = str(Path.home() / "AppData" / "Local" / "ms-playwright") - os.environ["PLAYWRIGHT_BROWSERS_PATH"] = BROWSERS_PATH -else: - # Docker环境,使用已设置的环境变量 - BROWSERS_PATH = os.environ["PLAYWRIGHT_BROWSERS_PATH"] - -# 获取配置 -config = get_config() - -_playwright_automation_instances: "weakref.WeakSet[PlaywrightAutomation]" = weakref.WeakSet() - - -def _cleanup_playwright_automation_instances(): - """进程退出时清理残留的自动化实例(弱引用,不阻止GC)""" - for inst in list(_playwright_automation_instances): - try: - inst._force_cleanup() - except Exception: - pass - - -atexit.register(_cleanup_playwright_automation_instances) - - -@dataclass -class BrowseResult: - """浏览结果""" - success: bool - total_items: int = 0 - total_attachments: int = 0 - error_message: str = "" - - -class PlaywrightBrowserManager: - """Playwright浏览器管理器 - 每个账号独立的浏览器实例""" - - def __init__(self, headless: bool = True, log_callback: Optional[Callable] = None): - """ - 初始化浏览器管理器 - - Args: - headless: 是否使用无头模式 - log_callback: 日志回调函数,签名: log_callback(message, account_id=None) - """ - self.headless = headless - self.log_callback = log_callback - self._lock = threading.Lock() - - def log(self, message: str, account_id: Optional[str] = None): - """记录日志""" - if self.log_callback: - self.log_callback(message, account_id) - - def create_browser(self, proxy_config=None): - """创建新的独立浏览器实例(每个账号独立)""" - try: - # self.log("初始化Playwright实例...") # 精简日志 - playwright = sync_playwright().start() - - # self.log("启动独立浏览器进程...") # 精简日志 - start_time = time.time() - - # 准备浏览器启动参数 - launch_options = { - 'headless': self.headless, - 'args': [ - '--no-sandbox', - '--disable-dev-shm-usage', - '--disable-gpu', - '--disable-extensions', - '--disable-notifications', - '--disable-infobars', - '--disable-default-apps', - '--disable-background-timer-throttling', - '--disable-backgrounding-occluded-windows', - '--disable-renderer-backgrounding', - ] - } - - # 如果有代理配置,添加代理 - if proxy_config and proxy_config.get('server'): - launch_options['proxy'] = { - 'server': proxy_config['server'] - } - self.log(f"使用代理: {proxy_config['server']}") - - browser = playwright.chromium.launch(**launch_options) - - elapsed = time.time() - start_time - # self.log(f"独立浏览器启动成功") # 精简日志 - - return playwright, browser - - except Exception as e: - self.log(f"启动浏览器失败: {str(e)}") - raise - - def create_browser_and_context(self, proxy_config=None, storage_state=None): - """创建独立的浏览器和上下文(每个账号完全隔离)""" - playwright, browser = self.create_browser(proxy_config) - - start_time = time.time() - # self.log("创建浏览器上下文...") # 精���日志 - - context_options = { - 'viewport': {'width': 1920, 'height': 1080}, - 'user_agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36', - 'device_scale_factor': 2, # 2倍设备像素比,提高文字清晰度 - } - if storage_state: - context_options['storage_state'] = storage_state - - context = browser.new_context(**context_options) - - # 设置默认超时 - context.set_default_timeout(config.DEFAULT_TIMEOUT) - context.set_default_navigation_timeout(config.PAGE_LOAD_TIMEOUT) - - elapsed = time.time() - start_time - # self.log(f"上下文创建完成") # 精简日志 - - return playwright, browser, context - - -class PlaywrightAutomation: - """Playwright自动化操作类""" - - def __init__(self, browser_manager: PlaywrightBrowserManager, account_id: str, proxy_config: Optional[dict] = None): - """ - 初始化自动化操作 - - Args: - browser_manager: 浏览器管理器 - account_id: 账号ID(用于日志) - """ - self.browser_manager = browser_manager - self.account_id = account_id - self.proxy_config = proxy_config - self.playwright: Optional[Playwright] = None - self.browser: Optional[Browser] = None - self.context: Optional[BrowserContext] = None - self.page: Optional[Page] = None - self.main_page: Optional[Page] = None - self._closed = False # 防止重复关闭 - self._lock = threading.Lock() # Bug #13 fix: 保护浏览器资源访问 - - _playwright_automation_instances.add(self) - - def log(self, message: str): - """记录日志""" - self.browser_manager.log(message, self.account_id) - - - def get_cookies_path(self, username: str) -> str: - """获取cookies文件路径""" - import os - cookies_dir = getattr(config, "COOKIES_DIR", "/app/data/cookies") - os.makedirs(cookies_dir, exist_ok=True) - # 安全修复:使用SHA256代替MD5作为文件名哈希 - import hashlib - filename = hashlib.sha256(username.encode()).hexdigest()[:32] + '.json' - return os.path.join(cookies_dir, filename) - - def save_cookies(self, username: str): - """保存当前会话的cookies""" - try: - if self.context: - storage = self.context.storage_state() - cookies_path = self.get_cookies_path(username) - with open(cookies_path, 'w', encoding='utf-8') as f: - json.dump(storage, f) - self.log(f"Cookies已保存") - return True - except Exception as e: - self.log(f"保存cookies失败: {e}") - return False - - def load_cookies(self, username: str) -> bool: - """加载已保存的cookies""" - import os - cookies_path = self.get_cookies_path(username) - if not os.path.exists(cookies_path): - return False - - try: - # 检查cookies文件是否过期(24小时) - import time as time_module - file_age = time_module.time() - os.path.getmtime(cookies_path) - if file_age > 24 * 3600: # 24小时 - self.log(f"Cookies已过期,需要重新登录") - os.remove(cookies_path) - return False - - with open(cookies_path, 'r', encoding='utf-8') as f: - storage = json.load(f) - - # 创建带cookies的浏览器上下文 - self.playwright, self.browser, self.context = self.browser_manager.create_browser_and_context( - self.proxy_config, - storage_state=storage - ) - self.page = self.context.new_page() - self.main_page = self.page - return True - except Exception as e: - self.log(f"加载cookies失败: {e}") - return False - - def load_cookies_into_current_browser(self, username: str) -> bool: - """在“已连接的现有 browser”上加载 cookies 创建 context(用于浏览器池复用)""" - import os - if not self.browser or not self.browser.is_connected(): - return False - - cookies_path = self.get_cookies_path(username) - if not os.path.exists(cookies_path): - return False - - try: - # 检查cookies文件是否过期(24小时) - import time as time_module - file_age = time_module.time() - os.path.getmtime(cookies_path) - if file_age > 24 * 3600: - self.log("Cookies已过期,需要重新登录") - os.remove(cookies_path) - return False - - with open(cookies_path, 'r', encoding='utf-8') as f: - storage = json.load(f) - - context_options = { - 'viewport': {'width': 1920, 'height': 1080}, - 'user_agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36', - 'device_scale_factor': 2, - 'storage_state': storage - } - self.context = self.browser.new_context(**context_options) - self.context.set_default_timeout(config.DEFAULT_TIMEOUT) - self.context.set_default_navigation_timeout(config.PAGE_LOAD_TIMEOUT) - self.page = self.context.new_page() - self.main_page = self.page - return True - except Exception as e: - self.log(f"加载cookies失败: {e}") - try: - if self.context: - self.context.close() - except Exception: - pass - self.context = None - self.page = None - self.main_page = None - return False - - def check_login_state(self) -> bool: - """检查当前是否处于登录状态""" - try: - index_url = getattr(config, "ZSGL_INDEX_URL", None) - if not index_url: - login_url = getattr(config, "ZSGL_LOGIN_URL", "https://postoa.aidunsoft.com/admin/login.aspx") - index_filename = getattr(config, "ZSGL_INDEX_URL_PATTERN", "index.aspx") - parsed = urlsplit(login_url) - if parsed.scheme and parsed.netloc: - path = parsed.path or "/" - if path.endswith("/"): - path = path + index_filename - else: - path = path.rsplit("/", 1)[0] + "/" + index_filename - index_url = urlunsplit((parsed.scheme, parsed.netloc, path, "", "")) - else: - index_url = "https://postoa.aidunsoft.com/admin/index.aspx" - - # 访问首页检查是否跳转到登录页 - self.page.goto(index_url, timeout=15000) - self.page.wait_for_load_state('networkidle', timeout=10000) - current_url = self.page.url - # 如果还在index页面,说明登录态有效 - if getattr(config, "ZSGL_INDEX_URL_PATTERN", "index.aspx") in current_url: - return True - return False - except (TimeoutError, Exception) as e: - # 安全修复: 记录异常信息便于调试,但不重新抛出SystemExit/KeyboardInterrupt - if isinstance(e, (SystemExit, KeyboardInterrupt)): - raise - return False - - def quick_login(self, username: str, password: str, remember: bool = True): - """快速登录 - 使用池中浏览器时直接登录,否则尝试cookies""" - # 如果已有浏览器实例(从池中获取),优先尝试复用cookies(避免重复登录/减少耗时) - if self.browser and self.browser.is_connected(): - if self.load_cookies_into_current_browser(username): - self.log("使用池中浏览器,尝试使用已保存的登录态...") - if self.check_login_state(): - self.log("✓ 登录态有效,跳过登录") - return {"success": True, "message": "使用已保存的登录态", "used_cookies": True} - else: - self.log("登录态已失效,重新登录") - try: - if self.context: - self.context.close() - except Exception: - pass - self.context = None - self.page = None - self.main_page = None - - self.log("使用池中浏览器,直接登录") - result = self.login(username, password, remember) - if result.get('success'): - self.save_cookies(username) - result['used_cookies'] = False - return result - - # 无现有浏览器时,尝试使用cookies - if self.load_cookies(username): - self.log(f"尝试使用已保存的登录态...") - if self.check_login_state(): - self.log(f"✓ 登录态有效,跳过登录") - return {"success": True, "message": "使用已保存的登录态", "used_cookies": True} - else: - self.log(f"登录态已失效,重新登录") - # 关闭当前context,重新登录 - try: - if self.context: - self.context.close() - if self.browser: - self.browser.close() - if self.playwright: - self.playwright.stop() - except Exception: - pass # 清理时忽略错误 - - # 正常登录 - result = self.login(username, password, remember) - - # 登录成功后保存cookies - if result.get('success'): - self.save_cookies(username) - result['used_cookies'] = False - - return result - - - def login(self, username: str, password: str, remember: bool = True) -> bool: - """ - 登录系统 - - Args: - username: 用户名 - password: 密码 - remember: 是否记住密码 - - Returns: - 是否登录成功 - """ - try: - start_time = time.time() - - # 如果已有浏览器实例(从浏览器池获取),只创建context - if self.browser and self.browser.is_connected(): - self.log("使用池中浏览器...") - context_options = { - 'viewport': {'width': 1920, 'height': 1080}, - 'user_agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36', - 'device_scale_factor': 2, - } - self.context = self.browser.new_context(**context_options) - else: - # 创建新的浏览器和上下文 - self.playwright, self.browser, self.context = self.browser_manager.create_browser_and_context(self.proxy_config) - elapsed = time.time() - start_time - # self.log("浏览器就绪") # 精简日志 - - # self.log("创建页面...") # 精简日志 - self.page = self.context.new_page() - self.main_page = self.page - - # self.log("访问登录页面...") # 精简日志 - # 使用重试机制处理超时 - max_retries = 2 - for attempt in range(max_retries): - try: - self.page.goto(config.ZSGL_LOGIN_URL, timeout=60000) - break - except Exception as e: - if attempt < max_retries - 1: - self.log(f"页面加载超时,重试中... ({attempt + 1}/{max_retries})") - time.sleep(2) - else: - raise - - # self.log("填写登录信息...") # 精简日志 - self.page.fill('#txtUserName', username) - self.page.fill('#txtPassword', password) - - if remember: - self.page.check('#chkRemember') - - # self.log("点击登录按钮...") # 精简日志 - self.page.click('#btnSubmit') - - # 等待跳转 - # self.log("等待登录处理...") # 精简日志 - self.page.wait_for_load_state('networkidle', timeout=10000) # 优化为10秒 - - # 检查登录结果 - current_url = self.page.url - self.log(f"当前URL: {current_url}") - - if config.ZSGL_INDEX_URL_PATTERN in current_url: - self.log("登录成功!") - return {"success": True, "error_type": None, "message": "登录成功"} - else: - # 检查是否显示了错误提示 - error_message = "登录失败" - error_type = "unknown" # 默认为未知错误,不是密码错误 - - try: - # 尝试获取页面上的错误提示 - error_element = self.page.locator('#lblMsg, .error-message, [class*="error"]').first - if error_element.is_visible(timeout=2000): - error_text = error_element.inner_text().strip() - if error_text: - error_message = error_text - self.log(f"登录错误提示: {error_text}") - # 只有明确提示密码错误时才标记为密码错误 - if "密码" in error_text or "password" in error_text.lower() or "用户名" in error_text or "账号" in error_text: - error_type = "password_error" - else: - error_type = "login_error" - except Exception: - pass # 获取错误提示失败时忽略 - - # 如果没有明确的错误提示,可能是网络问题,不认为是密码错误 - if error_type == "unknown": - error_message = "登录失败,可能是网络问题或页面加载超时" - error_type = "network_error" - - self.log(error_message) - return {"success": False, "error_type": error_type, "message": error_message} - - except Exception as e: - error_msg = str(e) - self.log(f"登录过程中出错: {error_msg}") - return {"success": False, "error_type": "exception", "message": error_msg} - - def is_context_error(self, error_msg: str) -> bool: - """检查是否是上下文/导航相关错误""" - error_keywords = [ - "Frame was detached", - "Execution context was destroyed", - "navigation", - "detached", - "Target closed", - "Session closed", - "Connection closed" - ] - error_lower = error_msg.lower() - return any(keyword.lower() in error_lower for keyword in error_keywords) - - def safe_execute(self, action, description="操作", max_retries=3, recover_browse_type=None): - """安全执行操作,自动处理上下文销毁等错误 - - Args: - action: 要执行的函数 - description: 操作描述(用于日志) - max_retries: 最大重试次数 - recover_browse_type: 恢复时需要重新点击的浏览类型 - - Returns: - (success, result) 元组 - """ - last_error = None - for attempt in range(max_retries): - try: - result = action() - return True, result - except Exception as e: - last_error = str(e) - if self.is_context_error(last_error): - if attempt < max_retries - 1: - self.log(f"⚠ {description}时上下文失效,尝试恢复... ({attempt+1}/{max_retries})") - time.sleep(1 + attempt * 0.5) - # 尝试恢复iframe - if self.recover_iframe(recover_browse_type): - continue - else: - self.log(f" iframe恢复失败,继续重试...") - else: - self.log(f"✗ {description}失败,已重试{max_retries}次: {last_error}") - else: - # 非上下文错误,直接返回失败 - self.log(f"✗ {description}失败: {last_error}") - return False, None - - return False, None - - def get_iframe_safe(self, retry=True, max_retries=5): - """安全地获取iframe,带重试机制 - - Args: - retry: 是否启用重试 - max_retries: 最大重试次数 - """ - for attempt in range(max_retries if retry else 1): - try: - # Bug #13 fix: 使用锁保护main_page访问 - with self._lock: - # 先检查main_page是否有效 - if not self.main_page: - self.log("⚠ main_page无效") - return None - - iframe = self.main_page.frame('mainframe') - if iframe: - return iframe - except Exception as e: - error_msg = str(e) - if self.is_context_error(error_msg): - self.log(f"⚠ 获取iframe时上下文失效,等待恢复... ({attempt+1}/{max_retries})") - else: - self.log(f"⚠ 获取iframe出错: {error_msg}") - - if attempt < max_retries - 1: - time.sleep(0.5 + attempt * 0.3) # 递增等待时间 - - return None - - def recover_iframe(self, browse_type: str = None) -> bool: - """尝试恢复iframe连接 - - 当遇到 Frame was detached / Execution context was destroyed 错误时调用此函数 - 采用多级恢复策略,逐步升级恢复力度 - """ - self.log("🔄 尝试恢复iframe连接...") - - # 方法1: 直接尝试获取iframe(最快,适用于短暂的上下文切换) - self.page = self.get_iframe_safe(retry=True, max_retries=3) - if self.page: - self.log("✓ iframe恢复成功(直接获取)") - return True - - # 方法2: 等待页面稳定后重试(适用于页面正在加载的情况) - self.log(" 等待页面稳定...") - time.sleep(1.5) - try: - self.main_page.wait_for_load_state('domcontentloaded', timeout=5000) - except Exception: # Bug fix: 明确捕获Exception - pass - try: - self.main_page.wait_for_load_state('networkidle', timeout=10000) - except Exception: # Bug fix: 明确捕获Exception - pass - - self.page = self.get_iframe_safe(retry=True, max_retries=3) - if self.page: - self.log("✓ iframe恢复成功(等待后获取)") - return True - - # 方法3: 使用JavaScript强制等待并获取iframe - self.log(" 尝试JavaScript方式获取iframe...") - try: - # 等待iframe存在 - self.main_page.wait_for_selector("iframe[name='mainframe']", timeout=5000) - # 使用evaluate确保iframe可用 - has_iframe = self.main_page.evaluate("""() => { - const iframe = document.querySelector('iframe[name="mainframe"]'); - return iframe && iframe.contentWindow && iframe.contentDocument; - }""") - if has_iframe: - time.sleep(0.5) - self.page = self.get_iframe_safe(retry=True, max_retries=3) - if self.page: - self.log("✓ iframe恢复成功(JavaScript验证后获取)") - return True - except Exception as e: - self.log(f" JavaScript方式失败: {str(e)[:50]}") - - # 方法4: 刷新页面并重新切换(最后手段) - self.log(" 刷新页面重试...") - try: - self.main_page.reload(wait_until='domcontentloaded') - time.sleep(2) - - # 等待iframe出现 - self.main_page.wait_for_selector("iframe[name='mainframe']", timeout=15000) - time.sleep(1) - - self.page = self.get_iframe_safe(retry=True, max_retries=5) - if self.page: - # 如果有浏览类型,重新点击 - if browse_type: - self.log(f" 重新点击'{browse_type}'...") - selector = f"//div[contains(@class, 'rule-multi-radio')]//a[contains(text(), '{browse_type}')]" - try: - self.page.locator(selector).click(timeout=5000) - time.sleep(1.5) - # 等待表格加载 - try: - self.page.locator("//table[@class='ltable']").wait_for(timeout=10000) - except Exception: # Bug fix: 明确捕获Exception - pass - self.log(f"✓ iframe恢复成功(刷新后重新点击'{browse_type}')") - except Exception: # Bug fix: 明确捕获Exception - # 尝试点击label - try: - label_selector = f"//label[contains(text(), '{browse_type}')]" - self.page.locator(label_selector).click(timeout=5000) - time.sleep(1.5) - self.log(f"✓ iframe恢复成功(刷新后点击label)") - except Exception as label_e: - self.log(f" 点击label也失败: {str(label_e)[:30]}") - return False - else: - self.log("✓ iframe恢复成功(刷新后获取)") - return True - except Exception as e: - self.log(f"✗ 刷新恢复失败: {str(e)[:50]}") - - self.log("✗ iframe恢复失败,所有方法都已尝试") - return False - - def switch_to_iframe(self) -> bool: - """切换到mainframe iframe""" - try: - # self.log("查找并切换到iframe...") # 精简日志 - - # 使用Playwright的等待机制 - max_retries = 3 - for i in range(max_retries): - try: - # 等待iframe元素出现 - self.main_page.wait_for_selector("iframe[name='mainframe']", timeout=2000) - - # 获取iframe - iframe = self.get_iframe_safe() - if iframe: - self.page = iframe - self.log(f"✓ 成功切换到iframe (尝试 {i+1}/{max_retries})") - return True - except Exception as e: - if i < max_retries - 1: - self.log(f"未找到iframe,重试中... ({i+1}/{max_retries})") - time.sleep(1) - else: - self.log(f"所有重试都失败,未找到iframe") - - return False - - except Exception as e: - self.log(f"切换到iframe时出错: {str(e)}") - return False - - def safe_click(self, locator, timeout=5000, description="元素"): - """安全地点击元素,捕获导航异常""" - try: - locator.click(timeout=timeout) - return True - except Exception as e: - error_msg = str(e) - if "Execution context was destroyed" in error_msg or "navigation" in error_msg.lower(): - self.log(f"⚠ 点击{description}时检测到页面导航,等待页面稳定...") - time.sleep(1) - return True # 虽然有异常,但导航成功,返回True - else: - self.log(f"点击{description}失败: {error_msg}") - return False - - def switch_browse_type(self, browse_type: str, max_retries: int = 2) -> bool: - """ - 切换浏览类型(带重试机制) - - Args: - browse_type: 浏览类型(注册前未读/应读) - max_retries: 最大重试次数(默认2次) - - Returns: - 是否切换成功 - """ - for attempt in range(max_retries + 1): - try: - if attempt > 0: - self.log(f"⚠ 第 {attempt + 1} 次尝试切换浏览类型...") - else: - self.log(f"切换到'{browse_type}'类型...") - - # 切换到iframe - if not self.switch_to_iframe(): - if attempt < max_retries: - self.log(f"iframe切换失败,等待1秒后重试...") - time.sleep(1) - continue - return False - - # 方法1: 尝试查找标签(如果JavaScript创建了的话) - selector = f"//div[contains(@class, 'rule-multi-radio')]//a[contains(text(), '{browse_type}')]" - - try: - # 等待并点击 - self.page.locator(selector).click(timeout=5000) - self.log(f"点击'{browse_type}'按钮成功") - - # 等待页面刷新并加载内容 - time.sleep(1.5) - - # 等待表格加载(最多等待30秒) - try: - self.page.locator("//table[@class='ltable']").wait_for(timeout=30000) - self.log("内容表格已加载") - except Exception as e: - self.log("等待表格加载超时,继续...") - - return True - except Exception as e: - error_msg = str(e) - if "Execution context was destroyed" in error_msg: - self.log(f"⚠ 检测到执行上下文被销毁") - if attempt < max_retries: - self.log(f"等待2秒后重试...") - time.sleep(2) - continue - self.log(f"未找到标签,尝试点击