Files
zsglpt/playwright_automation.py
Yu Yon 0fd7137cea Initial commit: 知识管理平台
主要功能:
- 多用户管理系统
- 浏览器自动化(Playwright)
- 任务编排和执行
- Docker容器化部署
- 数据持久化和日志管理

技术栈:
- Flask 3.0.0
- Playwright 1.40.0
- SQLite with connection pooling
- Docker + Docker Compose

部署说明详见README.md
2025-11-16 19:03:07 +08:00

763 lines
31 KiB
Python
Executable File
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Playwright版本 - 知识管理系统自动化核心
使用浏览器上下文(Context)实现高性能并发
"""
import os
from pathlib import Path
from playwright.sync_api import sync_playwright, Browser, BrowserContext, Page, Playwright
import time
import threading
from typing import Optional, Callable
from dataclasses import dataclass
# Pin the Playwright browser install directory so that a Nuitka onefile build
# does not look for browsers inside its temporary extraction directory.
# NOTE(review): the path is Windows-style (~/AppData/Local/ms-playwright) and is
# applied unconditionally, replacing any PLAYWRIGHT_BROWSERS_PATH already set —
# confirm this is intended for non-Windows deployments.
BROWSERS_PATH = str(Path.home().joinpath("AppData", "Local", "ms-playwright"))
os.environ.update({"PLAYWRIGHT_BROWSERS_PATH": BROWSERS_PATH})
# Configuration constants
class Config:
    """Static settings shared by the browser manager and the automation flow."""

    # Login page opened by PlaywrightAutomation.login().
    LOGIN_URL = "https://postoa.aidunsoft.com/admin/login.aspx"
    # Substring that must appear in the URL after submitting the login form
    # for the login to count as successful.
    INDEX_URL_PATTERN = "index.aspx"
    # Navigation timeout in milliseconds
    # (increased from 30s to 60s for multi-account support).
    PAGE_LOAD_TIMEOUT = 60000
    # Default action timeout in milliseconds (raised to support concurrent accounts).
    DEFAULT_TIMEOUT = 60000
    # Maximum number of concurrent contexts.
    MAX_CONCURRENT_CONTEXTS = 100
@dataclass
class BrowseResult:
    """Outcome of one browsing run."""

    # True when the browsing loop ended without an unhandled error.
    success: bool
    # Number of content rows visited.
    total_items: int = 0
    # Number of attachments opened.
    total_attachments: int = 0
    # Description of the failure; empty when none was recorded.
    error_message: str = ""
class PlaywrightBrowserManager:
    """Playwright browser manager that gives every account its own browser instance."""

    def __init__(self, headless: bool = True, log_callback: Optional[Callable] = None):
        """
        Initialise the browser manager.

        Args:
            headless: Whether browsers are launched in headless mode.
            log_callback: Optional logging callback, invoked as
                log_callback(message, account_id).
        """
        self.headless = headless
        self.log_callback = log_callback
        # Created here but not used by any method of this class.
        self._lock = threading.Lock()

    def log(self, message: str, account_id: Optional[str] = None):
        """Forward a message to the log callback; do nothing when none is set."""
        if self.log_callback:
            self.log_callback(message, account_id)

    def _release(self, playwright, browser=None):
        """Best-effort teardown of a partially created browser stack."""
        if browser is not None:
            try:
                browser.close()
            except Exception as e:
                self.log(f"关闭浏览器时出错: {str(e)}")
        if playwright is not None:
            try:
                playwright.stop()
            except Exception as e:
                self.log(f"停止Playwright时出错: {str(e)}")

    def create_browser(self, proxy_config=None):
        """
        Start a new Playwright driver and launch a dedicated Chromium browser.

        Args:
            proxy_config: Optional dict; when it has a truthy 'server' entry,
                that value is passed to the browser as its proxy server.

        Returns:
            A (playwright, browser) tuple. The caller owns both objects and
            must close/stop them.

        Raises:
            Exception: Whatever the driver start or browser launch raised,
                re-raised after logging. The driver is stopped first so a
                failed launch does not leave it running.
        """
        playwright = None
        try:
            self.log("初始化Playwright实例...")
            playwright = sync_playwright().start()
            self.log("启动独立浏览器进程...")
            start_time = time.time()
            # Browser launch options
            launch_options = {
                'headless': self.headless,
                'args': [
                    '--no-sandbox',
                    '--disable-dev-shm-usage',
                    '--disable-gpu',
                    '--disable-extensions',
                    '--disable-notifications',
                    '--disable-infobars',
                    '--disable-default-apps',
                    '--disable-background-timer-throttling',
                    '--disable-backgrounding-occluded-windows',
                    '--disable-renderer-backgrounding',
                ]
            }
            # Add the proxy only when one is configured
            if proxy_config and proxy_config.get('server'):
                launch_options['proxy'] = {
                    'server': proxy_config['server']
                }
                self.log(f"使用代理: {proxy_config['server']}")
            browser = playwright.chromium.launch(**launch_options)
            elapsed = time.time() - start_time
            self.log(f"独立浏览器启动成功 (耗时: {elapsed:.2f}秒)")
            return playwright, browser
        except Exception as e:
            self.log(f"启动浏览器失败: {str(e)}")
            # Do not leak the driver when the launch itself failed.
            self._release(playwright)
            raise

    def create_browser_and_context(self, proxy_config=None):
        """
        Create a dedicated browser plus one browser context for an account.

        Args:
            proxy_config: Optional proxy settings, forwarded to create_browser().

        Returns:
            A (playwright, browser, context) tuple owned by the caller.

        Raises:
            Exception: Propagated from create_browser(), or from context
                creation; in the latter case the browser and driver are
                closed before re-raising.
        """
        playwright, browser = self.create_browser(proxy_config)
        try:
            start_time = time.time()
            self.log("创建浏览器上下文...")
            context = browser.new_context(
                viewport={'width': 1920, 'height': 1080},
                user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
                device_scale_factor=2,  # 2x device pixel ratio for sharper text
            )
            # Default timeouts for actions and navigations
            context.set_default_timeout(Config.DEFAULT_TIMEOUT)
            context.set_default_navigation_timeout(Config.PAGE_LOAD_TIMEOUT)
            elapsed = time.time() - start_time
            self.log(f"上下文创建完成 (耗时: {elapsed:.3f}秒)")
            return playwright, browser, context
        except Exception:
            # The caller never receives the handles, so release them here.
            self._release(playwright, browser)
            raise
class PlaywrightAutomation:
    """Playwright automation for one account: log in, browse content, screenshot, close."""

    # XPath of the content table inside the "mainframe" iframe.
    _TABLE_XPATH = "//table[@class='ltable']"
    # Content rows: skip the first row and keep rows with at least 5 cells.
    _ROWS_XPATH = "//table[@class='ltable']/tbody/tr[position()>1 and count(td)>=5]"
    # "Next page" link in the pager.
    _NEXT_PAGE_XPATH = "//div[@id='PageContent']/a[contains(text(), '下一页') or contains(text(), '»')]"

    def __init__(self, browser_manager: PlaywrightBrowserManager, account_id: str, proxy_config: Optional[dict] = None):
        """
        Prepare the automation for one account; no browser is started here.

        Args:
            browser_manager: Browser manager used to create the browser/context
                and to emit log messages.
            account_id: Account ID, attached to every log message.
            proxy_config: Optional proxy settings forwarded to the browser manager.
        """
        self.browser_manager = browser_manager
        self.account_id = account_id
        self.proxy_config = proxy_config
        self.playwright: Optional[Playwright] = None
        self.browser: Optional[Browser] = None
        self.context: Optional[BrowserContext] = None
        # `page` is the current working target (the top-level page or the
        # "mainframe" iframe); `main_page` always stays the top-level page.
        self.page: Optional[Page] = None
        self.main_page: Optional[Page] = None

    def log(self, message: str):
        """Log a message tagged with this account's ID."""
        self.browser_manager.log(message, self.account_id)

    def login(self, username: str, password: str, remember: bool = True) -> bool:
        """
        Start a browser, open the login page and sign in.

        Args:
            username: User name.
            password: Password.
            remember: Whether to tick the "remember password" checkbox.

        Returns:
            True when the URL after submitting contains
            Config.INDEX_URL_PATTERN; False otherwise or on any error.
            Resources created here stay open until close() is called.
        """
        try:
            self.log("创建浏览器上下文...")
            start_time = time.time()
            self.playwright, self.browser, self.context = self.browser_manager.create_browser_and_context(self.proxy_config)
            elapsed = time.time() - start_time
            self.log(f"浏览器和上下文创建完成 (耗时: {elapsed:.3f}秒)")
            self.log("创建页面...")
            self.page = self.context.new_page()
            self.main_page = self.page
            self.log("访问登录页面...")
            # Retry the navigation once before giving up
            max_retries = 2
            for attempt in range(max_retries):
                try:
                    self.page.goto(Config.LOGIN_URL, timeout=Config.PAGE_LOAD_TIMEOUT)
                    break
                except Exception:
                    if attempt < max_retries - 1:
                        self.log(f"页面加载超时,重试中... ({attempt + 1}/{max_retries})")
                        time.sleep(2)
                    else:
                        raise
            self.log("填写登录信息...")
            self.page.fill('#txtUserName', username)
            self.page.fill('#txtPassword', password)
            if remember:
                self.page.check('#chkRemember')
            self.log("点击登录按钮...")
            self.page.click('#btnSubmit')
            # Wait for the post-login redirect
            self.log("等待登录处理...")
            self.page.wait_for_load_state('networkidle', timeout=30000)
            # Decide success from the resulting URL
            current_url = self.page.url
            self.log(f"当前URL: {current_url}")
            if Config.INDEX_URL_PATTERN in current_url:
                self.log("登录成功!")
                return True
            self.log("登录失败,请检查用户名和密码")
            return False
        except Exception as e:
            self.log(f"登录过程中出错: {str(e)}")
            return False

    def switch_to_iframe(self) -> bool:
        """
        Point self.page at the "mainframe" iframe of the main page.

        Returns:
            True once the frame was found (up to 3 attempts), False otherwise.
        """
        try:
            self.log("查找并切换到iframe...")
            max_retries = 3
            for i in range(max_retries):
                try:
                    # Wait for the iframe element to appear
                    self.main_page.wait_for_selector("iframe[name='mainframe']", timeout=2000)
                    iframe = self.main_page.frame('mainframe')
                    if iframe:
                        self.page = iframe
                        self.log(f"✓ 成功切换到iframe (尝试 {i+1}/{max_retries})")
                        return True
                except Exception:
                    if i < max_retries - 1:
                        self.log(f"未找到iframe重试中... ({i+1}/{max_retries})")
                        time.sleep(1)
                    else:
                        self.log(f"所有重试都失败未找到iframe")
            return False
        except Exception as e:
            self.log(f"切换到iframe时出错: {str(e)}")
            return False

    def _click_and_wait_for_table(self, selector: str, clicked_message: str):
        """Click `selector` in the current frame, then wait for the content table.

        A failed click propagates to the caller; a table wait that times out
        is only logged.
        """
        self.page.locator(selector).click(timeout=5000)
        self.log(clicked_message)
        # Give the page time to refresh before waiting for the table
        time.sleep(1.5)
        try:
            self.page.locator(self._TABLE_XPATH).wait_for(timeout=30000)
            self.log("内容表格已加载")
        except Exception:
            self.log("等待表格加载超时,继续...")

    def switch_browse_type(self, browse_type: str, max_retries: int = 2) -> bool:
        """
        Select a browse type inside the iframe, with retries.

        Args:
            browse_type: Label text of the browse type to select
                (the original docstring lists: 注册前未读/应读/已读).
            max_retries: Number of retries after the first attempt (default 2).

        Returns:
            True when the type's link or label was clicked, False otherwise.
        """
        for attempt in range(max_retries + 1):
            can_retry = attempt < max_retries
            try:
                if attempt > 0:
                    self.log(f"⚠ 第 {attempt + 1} 次尝试切换浏览类型...")
                else:
                    self.log(f"切换到'{browse_type}'类型...")
                if not self.switch_to_iframe():
                    if can_retry:
                        self.log(f"iframe切换失败,等待1秒后重试...")
                        time.sleep(1)
                        continue
                    return False
                # Method 1: the <a> element (present if JavaScript created it)
                selector = f"//div[contains(@class, 'rule-multi-radio')]//a[contains(text(), '{browse_type}')]"
                try:
                    self._click_and_wait_for_table(selector, f"点击'{browse_type}'按钮成功")
                    return True
                except Exception as e:
                    if "Execution context was destroyed" in str(e):
                        self.log(f"⚠ 检测到执行上下文被销毁")
                        if can_retry:
                            self.log(f"等待2秒后重试...")
                            time.sleep(2)
                            continue
                    self.log(f"未找到<a>标签,尝试点击<label>...")
                # Method 2: click the <label> to emulate the radio button
                label_selector = f"//label[contains(text(), '{browse_type}')]"
                try:
                    self._click_and_wait_for_table(label_selector, f"点击'{browse_type}'标签成功")
                    return True
                except Exception as e:
                    if "Execution context was destroyed" in str(e):
                        self.log(f"⚠ 检测到执行上下文被销毁")
                        if can_retry:
                            self.log(f"等待2秒后重试...")
                            time.sleep(2)
                            continue
                    self.log(f"未找到<label>标签")
                # Both methods failed; retry if attempts remain
                if can_retry:
                    self.log(f"切换失败,等待2秒后重试...")
                    time.sleep(2)
                    continue
                return False
            except Exception as e:
                error_msg = str(e)
                self.log(f"切换浏览类型时出错: {error_msg}")
                # Destroyed execution contexts and navigation errors are retried
                if "Execution context was destroyed" in error_msg or "navigation" in error_msg.lower():
                    if can_retry:
                        self.log(f"⚠ 检测到执行上下文被销毁或导航错误,等待2秒后重试...")
                        time.sleep(2)
                        continue
                return False
        # All attempts used up
        self.log(f"❌ 切换浏览类型失败,已重试 {max_retries}")
        return False

    def _restart_from_first_page(self, browse_type: str):
        """Reload the main page and re-select `browse_type` inside the iframe."""
        self.log("刷新页面并重新点击浏览类型...")
        self.main_page.reload()
        # 1.5s for the reload plus 0.5s before looking up the iframe
        time.sleep(2.0)
        self.page = self.main_page.frame('mainframe')
        selector = f"//div[contains(@class, 'rule-multi-radio')]//a[contains(text(), '{browse_type}')]"
        try:
            self._click_and_wait_for_table(selector, f"重新点击'{browse_type}'按钮成功")
        except Exception:
            # Fall back to the <label>; a failure here propagates to the caller
            label_selector = f"//label[contains(text(), '{browse_type}')]"
            self.page.locator(label_selector).click(timeout=5000)
            self.log(f"点击'{browse_type}'标签成功")
            time.sleep(1.5)

    def _turn_page(self, browse_type: str, current_page: int, completed_first_round: bool):
        """Go to the next page, or wrap around to the first page when there is none.

        Returns:
            (new_page_number, completed_first_round) after the move.
        """
        next_button = self.page.locator(self._NEXT_PAGE_XPATH)
        if next_button.count() > 0:
            self.log("点击下一页...")
            next_button.click()
            time.sleep(1.5)
            return current_page + 1, completed_first_round
        # No next page: one full round is done, start over from page 1
        if not completed_first_round:
            self.log("完成第一轮浏览,准备返回第一页继续浏览...")
        else:
            self.log("完成一轮浏览,返回第一页继续...")
        self._restart_from_first_page(browse_type)
        return 1, True

    def _view_first_attachment(self, row, interval: float, browse_type: str) -> bool:
        """Open the first attachment link of `row`, then return to the list.

        Returns:
            True when an attachment was opened and the list was re-entered,
            False when the row has no attachment or handling failed.
        """
        att_links_locator = row.locator("xpath=.//td[5]//a[contains(@class, 'link-btn')]")
        if att_links_locator.count() == 0:
            return False
        # Only the first attachment is handled
        att_link = att_links_locator.first
        att_text = att_link.inner_text().strip() or "附件"
        self.log(f" - 处理{att_text}...")
        counted = False
        try:
            pages_before = len(self.context.pages)
            att_link.click()
            # Short pause so a popup window, if any, has time to register
            time.sleep(0.5)
            pages_after = self.context.pages
            if len(pages_after) > pages_before:
                new_page = pages_after[-1]
                self.log(f" - 新窗口已打开,等待加载...")
                time.sleep(interval)  # user-configured viewing interval
                new_page.close()
                self.log(f" - 新窗口已关闭")
            else:
                # No popup: go back via history.back() in JavaScript, which,
                # per the original author's note, avoids the navigation wait
                # that go_back() performs for in-iframe attachment clicks.
                self.main_page.evaluate("() => window.history.back()")
                time.sleep(0.5)
                self.page = self.main_page.frame('mainframe')
            # Make sure we are back inside the iframe
            time.sleep(0.2)
            self.page = self.main_page.frame('mainframe')
            counted = True
            self.log(f" - {att_text}处理完成")
        except Exception as e:
            self.log(f" - 处理{att_text}时出错: {str(e)}")
            # Try to recover the iframe so the remaining rows can be processed
            try:
                iframe = self.main_page.frame('mainframe')
                if iframe:
                    self.page = iframe
                else:
                    self.log(f" - 找不到iframe刷新页面...")
                    self.main_page.reload()
                    time.sleep(1)
                    if self.switch_browse_type(browse_type):
                        self.page = self.main_page.frame('mainframe')
            except Exception as recover_error:
                # Recovery stays best-effort, but is no longer silent
                self.log(f" - 恢复iframe时出错: {str(recover_error)}")
        return counted

    def browse_content(self, browse_type: str,
                       auto_next_page: bool = True,
                       auto_view_attachments: bool = True,
                       interval: float = 1.0,
                       should_stop_callback: Optional[Callable] = None) -> BrowseResult:
        """
        Browse the content list of one browse type.

        Args:
            browse_type: Browse type to select.
            auto_next_page: Whether to turn pages automatically (wrapping back
                to page 1 after the last page).
            auto_view_attachments: Whether to open each row's first attachment.
            interval: Seconds a popup attachment window stays open.
            should_stop_callback: Optional callable; a truthy return value
                stops the browsing loop.

        Returns:
            BrowseResult with the item/attachment counts. `success` is False
            when the browse type could not be selected or an unhandled error
            occurred; `error_message` then describes the failure.
        """
        result = BrowseResult(success=False)
        try:
            self.log(f"导航到 '{browse_type}' 页面...")
            try:
                # Let the page finish loading first
                time.sleep(2)
                self.log(f"当前URL: {self.main_page.url}")
            except Exception as e:
                self.log(f"获取URL失败: {str(e)}")
            if not self.switch_browse_type(browse_type):
                result.error_message = "切换浏览类型失败"
                return result
            current_page = 1
            total_items = 0
            total_attachments = 0
            completed_first_round = False
            empty_page_counter = 0
            while True:
                if should_stop_callback and should_stop_callback():
                    self.log("收到停止信号,终止浏览")
                    break
                self.log(f"处理第 {current_page} 页...")
                # Always re-acquire the iframe before touching the list
                time.sleep(0.2)
                self.page = self.main_page.frame('mainframe')
                if not self.page:
                    self.log("错误无法获取iframe")
                    break
                # Extra wait so AJAX content can finish loading
                time.sleep(0.5)
                rows_count = self.page.locator(self._ROWS_XPATH).count()
                if rows_count == 0:
                    self.log("当前页面没有内容")
                    empty_page_counter += 1
                    self.log(f"连续空页面数: {empty_page_counter}")
                    # Done once a full round is complete and two pages in a row are empty
                    if completed_first_round and empty_page_counter >= 2:
                        self.log("检测到连续空页面且已完成至少一轮浏览,内容已浏览完毕")
                        break
                    if not auto_next_page:
                        break
                    try:
                        current_page, completed_first_round = self._turn_page(
                            browse_type, current_page, completed_first_round)
                    except Exception as e:
                        self.log(f"翻页时出错: {str(e)}")
                        break
                    continue
                # Content found: reset the empty-page counter
                empty_page_counter = 0
                self.log(f"找到 {rows_count} 条内容")
                for i in range(rows_count):
                    if should_stop_callback and should_stop_callback():
                        break
                    # Re-enter the iframe before each further row; the
                    # previous row may have navigated away and back.
                    if i > 0:
                        time.sleep(0.2)
                        self.page = self.main_page.frame('mainframe')
                    # Re-query the rows every time so the locator is fresh
                    row = self.page.locator(self._ROWS_XPATH).nth(i)
                    title = row.locator("xpath=.//td[4]").inner_text().strip()
                    self.log(f" [{i+1}] {title[:50]}")
                    total_items += 1
                    if auto_view_attachments and self._view_first_attachment(row, interval, browse_type):
                        total_attachments += 1
                # NOTE(review): when auto_next_page is False and the page still
                # has rows, the loop re-scans the same page until it is empty
                # or a stop is requested — confirm this is intended.
                if auto_next_page:
                    try:
                        time.sleep(0.2)
                        self.page = self.main_page.frame('mainframe')
                        current_page, completed_first_round = self._turn_page(
                            browse_type, current_page, completed_first_round)
                    except Exception as e:
                        self.log(f"翻页时出错: {str(e)}")
                        break
            result.success = True
            result.total_items = total_items
            result.total_attachments = total_attachments
            self.log(f"浏览完成!共 {total_items} 条内容,{total_attachments} 个附件")
        except Exception as e:
            result.error_message = str(e)
            self.log(f"浏览内容时出错: {str(e)}")
        return result

    def take_screenshot(self, filepath: str) -> bool:
        """
        Save a full-page JPEG screenshot of the main page.

        Args:
            filepath: Path the screenshot is written to.

        Returns:
            True when the screenshot was saved, False on any error.
        """
        try:
            # JPEG supports the `quality` option; 100 is the maximum.
            # full_page=True captures the whole scrollable page. The context
            # is created elsewhere in this file with a 1920x1080 viewport and
            # a device scale factor of 2.
            self.main_page.screenshot(
                path=filepath,
                type='jpeg',
                full_page=True,
                quality=100
            )
            self.log(f"截图已保存: {filepath}")
            return True
        except Exception as e:
            self.log(f"截图失败: {str(e)}")
            return False

    def close(self):
        """Close the context, the browser and the Playwright driver, in that order.

        Each step is attempted even if an earlier one failed; failures are
        logged and counted but never raised.
        """
        errors = []
        teardown = (
            (self.context, "close", "上下文已关闭", "关闭上下文时出错"),
            (self.browser, "close", "浏览器进程已关闭", "关闭浏览器时出错"),
            (self.playwright, "stop", "Playwright已停止", "停止Playwright时出错"),
        )
        for resource, method_name, done_message, failure_prefix in teardown:
            if not resource:
                continue
            try:
                getattr(resource, method_name)()
                self.log(done_message)
            except Exception as e:
                error_msg = f"{failure_prefix}: {str(e)}"
                self.log(error_msg)
                errors.append(error_msg)
        # Drop all references so the objects can be garbage-collected
        self.context = None
        self.page = None
        self.main_page = None
        self.browser = None
        self.playwright = None
        # Short pause to give the browser process time to exit
        time.sleep(0.5)
        if errors:
            self.log(f"资源清理完成,但有{len(errors)}个警告")
        else:
            self.log("资源清理完成")
# Simple manual smoke test
if __name__ == "__main__":
    print("Playwright自动化核心 - 测试")
    print("="*60)
    # PlaywrightBrowserManager has no initialize()/close() methods; the
    # browser is created by login() and released by automation.close().
    manager = PlaywrightBrowserManager(headless=True)
    automation = PlaywrightAutomation(manager, "test_account")
    try:
        # NOTE(review): hard-coded test credentials — move them out of the
        # source (environment variables or config) before sharing this file.
        if automation.login("19174616018", "aa123456"):
            result = automation.browse_content(
                browse_type="应读",
                auto_next_page=True,
                auto_view_attachments=True,
                interval=2.0  # longer viewing interval
            )
            print(f"\n浏览结果: {result}")
    finally:
        # Always release the browser, even when login or browsing fails
        automation.close()
    print("="*60)
    print("测试完成")