Initial commit: 知识管理平台

主要功能:
- 多用户管理系统
- 浏览器自动化(Playwright)
- 任务编排和执行
- Docker容器化部署
- 数据持久化和日志管理

技术栈:
- Flask 3.0.0
- Playwright 1.40.0
- SQLite with connection pooling
- Docker + Docker Compose

部署说明详见README.md
This commit is contained in:
Yu Yon
2025-11-16 19:03:07 +08:00
commit 0fd7137cea
23 changed files with 12061 additions and 0 deletions

762
playwright_automation.py Executable file
View File

@@ -0,0 +1,762 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Playwright版本 - 知识管理系统自动化核心
使用浏览器上下文(Context)实现高性能并发
"""
import os
from pathlib import Path
from playwright.sync_api import sync_playwright, Browser, BrowserContext, Page, Playwright
import time
import threading
from typing import Optional, Callable
from dataclasses import dataclass
# Pin the Playwright browser install location to avoid the Nuitka onefile
# temporary-directory problem. A PLAYWRIGHT_BROWSERS_PATH that is already set
# in the environment (for example by a container image) is respected; the
# per-user path below is only the fallback when nothing is configured.
BROWSERS_PATH = os.environ.setdefault(
    "PLAYWRIGHT_BROWSERS_PATH",
    str(Path.home() / "AppData" / "Local" / "ms-playwright"),
)
# Shared configuration constants.
class Config:
    """Namespace for module-wide constants: target URLs, timeouts and limits."""

    # Login page opened at the start of every session.
    LOGIN_URL = "https://postoa.aidunsoft.com/admin/login.aspx"

    # Substring looked for in the current URL to decide that login succeeded.
    INDEX_URL_PATTERN = "index.aspx"

    # Navigation timeout in milliseconds (raised from 30 s to 60 s for
    # multi-account support).
    PAGE_LOAD_TIMEOUT = 60 * 1000

    # Default per-operation timeout in milliseconds (raised to support
    # concurrent multi-account runs).
    DEFAULT_TIMEOUT = 60 * 1000

    # Maximum number of concurrent browser contexts.
    MAX_CONCURRENT_CONTEXTS = 100
@dataclass
class BrowseResult:
    """Outcome of one browsing run."""

    # Whether the run finished without an unrecoverable error.
    success: bool

    # Number of content rows visited.
    total_items: int = 0

    # Number of attachments opened.
    total_attachments: int = 0

    # Failure description; empty when nothing was recorded.
    error_message: str = ""
class PlaywrightBrowserManager:
    """Create isolated Playwright browser instances, one per account.

    The manager keeps no browser state of its own: every ``create_browser``
    call starts a new Playwright driver and a new Chromium process and hands
    both to the caller, who becomes responsible for closing them.
    """

    # Chromium command-line switches passed to every launched browser.
    _LAUNCH_ARGS = (
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--disable-gpu',
        '--disable-extensions',
        '--disable-notifications',
        '--disable-infobars',
        '--disable-default-apps',
        '--disable-background-timer-throttling',
        '--disable-backgrounding-occluded-windows',
        '--disable-renderer-backgrounding',
    )

    def __init__(self, headless: bool = True, log_callback: Optional[Callable] = None):
        """
        Initialise the manager.

        Args:
            headless: Whether browsers are launched in headless mode.
            log_callback: Optional logging hook, called as
                ``log_callback(message, account_id)``.
        """
        self.headless = headless
        self.log_callback = log_callback
        # Not used by any method of this class; kept as instance state.
        self._lock = threading.Lock()

    def log(self, message: str, account_id: Optional[str] = None):
        """Forward ``message`` to the log callback, if one was supplied."""
        if self.log_callback:
            self.log_callback(message, account_id)

    def _release(self, browser=None, playwright=None):
        """Best-effort teardown of a partially created browser/driver pair."""
        if browser is not None:
            try:
                browser.close()
            except Exception as e:
                self.log(f"关闭浏览器时出错: {str(e)}")
        if playwright is not None:
            try:
                playwright.stop()
            except Exception as e:
                self.log(f"停止Playwright时出错: {str(e)}")

    def create_browser(self, proxy_config=None):
        """
        Start a new Playwright driver and launch a dedicated Chromium process.

        Args:
            proxy_config: Optional dict; when it has a truthy ``'server'``
                entry, that server is used as the browser proxy.

        Returns:
            A ``(playwright, browser)`` tuple owned by the caller.

        Raises:
            Exception: Whatever Playwright raised while starting or launching.
                The error is logged, and a driver that was already started is
                stopped before the exception propagates.
        """
        playwright = None
        try:
            self.log("初始化Playwright实例...")
            playwright = sync_playwright().start()
            self.log("启动独立浏览器进程...")
            start_time = time.time()

            launch_options = {
                'headless': self.headless,
                'args': list(self._LAUNCH_ARGS),
            }
            # Only the proxy server address is forwarded to the browser.
            if proxy_config and proxy_config.get('server'):
                launch_options['proxy'] = {
                    'server': proxy_config['server']
                }
                self.log(f"使用代理: {proxy_config['server']}")

            browser = playwright.chromium.launch(**launch_options)
            elapsed = time.time() - start_time
            self.log(f"独立浏览器启动成功 (耗时: {elapsed:.2f}秒)")
            return playwright, browser
        except Exception as e:
            self.log(f"启动浏览器失败: {str(e)}")
            # Do not leak the driver when the launch fails after it started.
            self._release(playwright=playwright)
            raise

    def create_browser_and_context(self, proxy_config=None):
        """
        Create a dedicated browser plus one browser context.

        Args:
            proxy_config: Passed through to ``create_browser``.

        Returns:
            A ``(playwright, browser, context)`` tuple owned by the caller.

        Raises:
            Exception: Whatever browser or context creation raised. A browser
                and driver that were already created are released first.
        """
        playwright, browser = self.create_browser(proxy_config)
        try:
            start_time = time.time()
            self.log("创建浏览器上下文...")
            context = browser.new_context(
                viewport={'width': 1920, 'height': 1080},
                user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
                device_scale_factor=2,  # 2x device pixel ratio for sharper text
            )
            # Apply the default timeouts from Config to the whole context.
            context.set_default_timeout(Config.DEFAULT_TIMEOUT)
            context.set_default_navigation_timeout(Config.PAGE_LOAD_TIMEOUT)
            elapsed = time.time() - start_time
            self.log(f"上下文创建完成 (耗时: {elapsed:.3f}秒)")
            return playwright, browser, context
        except Exception:
            # The caller never receives the handles, so release them here.
            self._release(browser=browser, playwright=playwright)
            raise
class PlaywrightAutomation:
    """Automate one account's session: log in, pick a list type, browse, screenshot, close.

    ``login`` obtains a dedicated Playwright driver, browser and context from
    the browser manager; ``close`` releases them again.
    """

    # XPath selectors shared by several methods; all are evaluated inside the
    # iframe named 'mainframe'.
    _TABLE_XPATH = "//table[@class='ltable']"
    _ROWS_XPATH = "//table[@class='ltable']/tbody/tr[position()>1 and count(td)>=5]"
    _NEXT_PAGE_XPATH = "//div[@id='PageContent']/a[contains(text(), '下一页') or contains(text(), '»')]"

    def __init__(self, browser_manager: "PlaywrightBrowserManager", account_id: str, proxy_config: Optional[dict] = None):
        """
        Initialise the automation object without opening any browser.

        Args:
            browser_manager: Manager used to create the browser and to log.
            account_id: Account identifier attached to every log line.
            proxy_config: Optional proxy settings forwarded to the manager.
        """
        self.browser_manager = browser_manager
        self.account_id = account_id
        self.proxy_config = proxy_config
        self.playwright: Optional[Playwright] = None
        self.browser: Optional[Browser] = None
        self.context: Optional[BrowserContext] = None
        # ``page`` is the current working target (the top page after login,
        # the 'mainframe' iframe while browsing); ``main_page`` is always the
        # top-level page.
        self.page: Optional[Page] = None
        self.main_page: Optional[Page] = None

    def log(self, message: str):
        """Log ``message`` through the browser manager, tagged with this account."""
        self.browser_manager.log(message, self.account_id)

    def login(self, username: str, password: str, remember: bool = True) -> bool:
        """
        Open a fresh browser, load the login page and submit the credentials.

        Args:
            username: Login name.
            password: Login password.
            remember: Whether to tick the "remember" checkbox.

        Returns:
            True when the URL after submission contains
            ``Config.INDEX_URL_PATTERN``; False on wrong credentials or on any
            error (errors are logged, not raised).
        """
        try:
            self.log("创建浏览器上下文...")
            start_time = time.time()
            self.playwright, self.browser, self.context = self.browser_manager.create_browser_and_context(self.proxy_config)
            elapsed = time.time() - start_time
            self.log(f"浏览器和上下文创建完成 (耗时: {elapsed:.3f}秒)")

            self.log("创建页面...")
            self.page = self.context.new_page()
            self.main_page = self.page

            self.log("访问登录页面...")
            # Retry the initial navigation once before giving up.
            max_retries = 2
            for attempt in range(max_retries):
                try:
                    self.page.goto(Config.LOGIN_URL, timeout=Config.PAGE_LOAD_TIMEOUT)
                    break
                except Exception:
                    if attempt < max_retries - 1:
                        self.log(f"页面加载超时,重试中... ({attempt + 1}/{max_retries})")
                        time.sleep(2)
                    else:
                        raise

            self.log("填写登录信息...")
            self.page.fill('#txtUserName', username)
            self.page.fill('#txtPassword', password)
            if remember:
                self.page.check('#chkRemember')

            self.log("点击登录按钮...")
            self.page.click('#btnSubmit')

            # Wait for the post-login redirect to settle.
            self.log("等待登录处理...")
            self.page.wait_for_load_state('networkidle', timeout=30000)

            current_url = self.page.url
            self.log(f"当前URL: {current_url}")
            if Config.INDEX_URL_PATTERN in current_url:
                self.log("登录成功!")
                return True
            self.log("登录失败,请检查用户名和密码")
            return False
        except Exception as e:
            self.log(f"登录过程中出错: {str(e)}")
            return False

    def switch_to_iframe(self) -> bool:
        """
        Point ``self.page`` at the iframe named 'mainframe'.

        Makes up to three attempts, pausing one second between them.

        Returns:
            True once the frame was found, otherwise False.
        """
        try:
            self.log("查找并切换到iframe...")
            max_retries = 3
            for i in range(max_retries):
                try:
                    # Wait for the iframe element, then resolve the frame object.
                    self.main_page.wait_for_selector("iframe[name='mainframe']", timeout=2000)
                    iframe = self.main_page.frame('mainframe')
                    if iframe:
                        self.page = iframe
                        self.log(f"✓ 成功切换到iframe (尝试 {i+1}/{max_retries})")
                        return True
                except Exception:
                    pass  # handled below exactly like a missing frame
                # Both a wait failure and a missing frame object end up here,
                # so a missing frame is retried with the same delay.
                if i < max_retries - 1:
                    self.log(f"未找到iframe重试中... ({i+1}/{max_retries})")
                    time.sleep(1)
            self.log("所有重试都失败未找到iframe")
            return False
        except Exception as e:
            self.log(f"切换到iframe时出错: {str(e)}")
            return False

    @staticmethod
    def _browse_type_link_xpath(browse_type: str) -> str:
        """Return the XPath of the <a> option whose text contains ``browse_type``."""
        return f"//div[contains(@class, 'rule-multi-radio')]//a[contains(text(), '{browse_type}')]"

    @staticmethod
    def _browse_type_label_xpath(browse_type: str) -> str:
        """Return the XPath of the <label> whose text contains ``browse_type``."""
        return f"//label[contains(text(), '{browse_type}')]"

    def _click_and_settle(self, selector: str, success_message: str, wait_for_table: bool = True) -> None:
        """Click ``selector`` in the current frame, log, pause, then optionally wait for the table.

        A click failure propagates to the caller; a table-wait timeout is only
        logged.
        """
        self.page.locator(selector).click(timeout=5000)
        self.log(success_message)
        # Give the page time to refresh before looking for the table.
        time.sleep(1.5)
        if not wait_for_table:
            return
        try:
            self.page.locator(self._TABLE_XPATH).wait_for(timeout=30000)
            self.log("内容表格已加载")
        except Exception:
            self.log("等待表格加载超时,继续...")

    def switch_browse_type(self, browse_type: str, max_retries: int = 2) -> bool:
        """
        Select a browse type in the list view, with retries.

        Each attempt switches into the iframe, then clicks the matching <a>
        option and, if that fails, the matching <label>.

        Args:
            browse_type: Text of the option to select.
            max_retries: Number of retries after the first attempt (default 2).

        Returns:
            True when one of the clicks succeeded, otherwise False.
        """
        for attempt in range(max_retries + 1):
            can_retry = attempt < max_retries
            try:
                if attempt > 0:
                    self.log(f"⚠ 第 {attempt + 1} 次尝试切换浏览类型...")
                else:
                    self.log(f"切换到'{browse_type}'类型...")

                if not self.switch_to_iframe():
                    if can_retry:
                        self.log("iframe切换失败,等待1秒后重试...")
                        time.sleep(1)
                        continue
                    break

                # Strategy 1: the <a> element, if JavaScript created one.
                try:
                    self._click_and_settle(
                        self._browse_type_link_xpath(browse_type),
                        f"点击'{browse_type}'按钮成功",
                    )
                    return True
                except Exception as e:
                    if "Execution context was destroyed" in str(e):
                        self.log("⚠ 检测到执行上下文被销毁")
                        if can_retry:
                            self.log("等待2秒后重试...")
                            time.sleep(2)
                            continue
                    self.log("未找到<a>标签,尝试点击<label>...")

                # Strategy 2: click the <label> to simulate the radio button.
                try:
                    self._click_and_settle(
                        self._browse_type_label_xpath(browse_type),
                        f"点击'{browse_type}'标签成功",
                    )
                    return True
                except Exception as e:
                    if "Execution context was destroyed" in str(e):
                        self.log("⚠ 检测到执行上下文被销毁")
                        if can_retry:
                            self.log("等待2秒后重试...")
                            time.sleep(2)
                            continue
                    self.log("未找到<label>标签")

                # Both strategies failed on this attempt.
                if can_retry:
                    self.log("切换失败,等待2秒后重试...")
                    time.sleep(2)
                    continue
                break
            except Exception as e:
                error_msg = str(e)
                self.log(f"切换浏览类型时出错: {error_msg}")
                retryable = (
                    "Execution context was destroyed" in error_msg
                    or "navigation" in error_msg.lower()
                )
                if not retryable:
                    return False
                if can_retry:
                    self.log("⚠ 检测到执行上下文被销毁或导航错误,等待2秒后重试...")
                    time.sleep(2)
                    continue
                break

        # Reached only after the attempts are exhausted.
        self.log(f"❌ 切换浏览类型失败,已重试 {max_retries}")
        return False

    def _advance_page(self, browse_type: str, current_page: int, completed_first_round: bool):
        """Go to the next list page, or reload and restart from page one when there is none.

        Returns the updated ``(current_page, completed_first_round)`` pair.
        Errors propagate to the caller.
        """
        next_button = self.page.locator(self._NEXT_PAGE_XPATH)
        if next_button.count() > 0:
            self.log("点击下一页...")
            next_button.click()
            time.sleep(1.5)
            return current_page + 1, completed_first_round

        # No next page: one full round is done, start again from the first page.
        if not completed_first_round:
            completed_first_round = True
            self.log("完成第一轮浏览,准备返回第一页继续浏览...")
        else:
            self.log("完成一轮浏览,返回第一页继续...")

        self.log("刷新页面并重新点击浏览类型...")
        self.main_page.reload()
        time.sleep(1.5)
        time.sleep(0.5)
        self.page = self.main_page.frame('mainframe')

        try:
            self._click_and_settle(
                self._browse_type_link_xpath(browse_type),
                f"重新点击'{browse_type}'按钮成功",
            )
        except Exception:
            # Fall back to the <label>; no table wait on this path.
            self._click_and_settle(
                self._browse_type_label_xpath(browse_type),
                f"点击'{browse_type}'标签成功",
                wait_for_table=False,
            )
        return 1, completed_first_round

    def _view_first_attachment(self, row, browse_type: str, interval: float) -> bool:
        """Open the first attachment link of ``row`` and come back to the list.

        Returns True when an attachment was opened and handled, False when
        the row has no attachment link or handling it failed.
        """
        att_links_locator = row.locator("xpath=.//td[5]//a[contains(@class, 'link-btn')]")
        if att_links_locator.count() == 0:
            return False

        # Only the first attachment of a row is handled.
        att_link = att_links_locator.first
        att_text = att_link.inner_text().strip() or "附件"
        self.log(f" - 处理{att_text}...")
        try:
            pages_before = len(self.context.pages)
            att_link.click()
            # A short pause is enough to see whether a new window opened.
            time.sleep(0.5)
            pages_after = self.context.pages
            if len(pages_after) > pages_before:
                new_page = pages_after[-1]
                self.log(" - 新窗口已打开,等待加载...")
                time.sleep(interval)  # user-configured viewing interval
                new_page.close()
                self.log(" - 新窗口已关闭")
            else:
                # No new window: go back via JavaScript so that the call does
                # not wait for a navigation, as go_back() would.
                self.main_page.evaluate("() => window.history.back()")
                time.sleep(0.5)
            # Re-acquire the iframe after returning to the list.
            time.sleep(0.2)
            self.page = self.main_page.frame('mainframe')
            self.log(f" - {att_text}处理完成")
            return True
        except Exception as e:
            self.log(f" - 处理{att_text}时出错: {str(e)}")
            # Try to get back into the iframe so the next row can proceed.
            try:
                iframe = self.main_page.frame('mainframe')
                if iframe:
                    self.page = iframe
                else:
                    self.log(" - 找不到iframe刷新页面...")
                    self.main_page.reload()
                    time.sleep(1)
                    if self.switch_browse_type(browse_type):
                        self.page = self.main_page.frame('mainframe')
            except Exception as recover_error:
                self.log(f" - 恢复iframe失败: {str(recover_error)}")
            return False

    def browse_content(self, browse_type: str,
                       auto_next_page: bool = True,
                       auto_view_attachments: bool = True,
                       interval: float = 1.0,
                       should_stop_callback: Optional[Callable] = None) -> "BrowseResult":
        """
        Walk the content list of ``browse_type`` and optionally open attachments.

        With ``auto_next_page`` the list is paged through repeatedly: after the
        last page the view is reloaded and browsing restarts at page one, until
        two consecutive empty pages are seen after at least one full round.
        Without it, only the current page is processed once.

        Args:
            browse_type: Text of the browse-type option to select.
            auto_next_page: Whether to page through the list automatically.
            auto_view_attachments: Whether to open each row's first attachment.
            interval: Seconds to keep an attachment window open.
            should_stop_callback: Optional callable; a truthy return stops browsing.

        Returns:
            A BrowseResult. ``success`` is True when the loop ended normally
            (including a stop request or a paging error); on an unexpected
            error ``error_message`` is set and the counters stay at zero.
        """
        result = BrowseResult(success=False)
        try:
            self.log(f"导航到 '{browse_type}' 页面...")
            try:
                # Let the page finish loading first.
                time.sleep(2)
                self.log(f"当前URL: {self.main_page.url}")
            except Exception as e:
                self.log(f"获取URL失败: {str(e)}")

            if not self.switch_browse_type(browse_type):
                result.error_message = "切换浏览类型失败"
                return result

            current_page = 1
            total_items = 0
            total_attachments = 0
            completed_first_round = False
            empty_page_counter = 0

            while True:
                if should_stop_callback and should_stop_callback():
                    self.log("收到停止信号,终止浏览")
                    break

                self.log(f"处理第 {current_page} 页...")
                # Always re-acquire the iframe before touching the list.
                time.sleep(0.2)
                self.page = self.main_page.frame('mainframe')
                if not self.page:
                    self.log("错误无法获取iframe")
                    break
                # Extra pause so AJAX content can finish loading.
                time.sleep(0.5)

                rows_count = self.page.locator(self._ROWS_XPATH).count()

                if rows_count == 0:
                    self.log("当前页面没有内容")
                    empty_page_counter += 1
                    self.log(f"连续空页面数: {empty_page_counter}")
                    # Done once a full round is complete and pages stay empty.
                    if completed_first_round and empty_page_counter >= 2:
                        self.log("检测到连续空页面且已完成至少一轮浏览,内容已浏览完毕")
                        break
                    if not auto_next_page:
                        break
                    try:
                        current_page, completed_first_round = self._advance_page(
                            browse_type, current_page, completed_first_round
                        )
                    except Exception as e:
                        self.log(f"翻页时出错: {str(e)}")
                        break
                    continue

                # Content found: reset the empty-page streak.
                empty_page_counter = 0
                self.log(f"找到 {rows_count} 条内容")

                for i in range(rows_count):
                    if should_stop_callback and should_stop_callback():
                        break
                    # Re-acquire the iframe before each later row, because the
                    # previous row may have navigated away and back.
                    if i > 0:
                        time.sleep(0.2)
                        self.page = self.main_page.frame('mainframe')
                    # Re-query the rows each time so the locator is fresh.
                    row = self.page.locator(self._ROWS_XPATH).nth(i)
                    title = row.locator("xpath=.//td[4]").inner_text().strip()
                    self.log(f" [{i+1}] {title[:50]}")
                    total_items += 1

                    if auto_view_attachments and self._view_first_attachment(row, browse_type, interval):
                        total_attachments += 1

                if not auto_next_page:
                    # Without auto paging the current page is processed once;
                    # looping again would re-read the same page forever.
                    break
                try:
                    time.sleep(0.2)
                    self.page = self.main_page.frame('mainframe')
                    current_page, completed_first_round = self._advance_page(
                        browse_type, current_page, completed_first_round
                    )
                except Exception as e:
                    self.log(f"翻页时出错: {str(e)}")
                    break

            result.success = True
            result.total_items = total_items
            result.total_attachments = total_attachments
            self.log(f"浏览完成!共 {total_items} 条内容,{total_attachments} 个附件")
        except Exception as e:
            result.error_message = str(e)
            self.log(f"浏览内容时出错: {str(e)}")
        return result

    def take_screenshot(self, filepath: str) -> bool:
        """
        Save a full-page JPEG screenshot of the top-level page.

        Args:
            filepath: Destination path of the image.

        Returns:
            True on success, False if the screenshot failed (the error is logged).
        """
        try:
            # JPEG is requested because that format accepts the quality
            # option; quality=100 is the top of its 0-100 range, and
            # full_page=True captures the whole page.
            self.main_page.screenshot(
                path=filepath,
                type='jpeg',
                full_page=True,
                quality=100
            )
            self.log(f"截图已保存: {filepath}")
            return True
        except Exception as e:
            self.log(f"截图失败: {str(e)}")
            return False

    def close(self):
        """Close the context, the browser and the Playwright driver, then drop all references.

        Every step is attempted even when an earlier one fails; failures are
        logged and counted, never raised.
        """
        errors = []
        had_resources = bool(self.context or self.browser or self.playwright)

        # (resource, method to call, success message, error prefix), in
        # teardown order: context, then browser, then driver.
        steps = (
            (self.context, "close", "上下文已关闭", "关闭上下文时出错"),
            (self.browser, "close", "浏览器进程已关闭", "关闭浏览器时出错"),
            (self.playwright, "stop", "Playwright已停止", "停止Playwright时出错"),
        )
        for resource, method_name, ok_message, error_prefix in steps:
            if not resource:
                continue
            try:
                getattr(resource, method_name)()
                self.log(ok_message)
            except Exception as e:
                error_msg = f"{error_prefix}: {str(e)}"
                self.log(error_msg)
                errors.append(error_msg)

        # Drop every reference so the objects can be garbage collected.
        self.context = None
        self.page = None
        self.main_page = None
        self.browser = None
        self.playwright = None

        # Give the browser process a moment to exit; pointless if nothing was open.
        if had_resources:
            time.sleep(0.5)

        if errors:
            self.log(f"资源清理完成,但有{len(errors)}个警告")
        else:
            self.log("资源清理完成")
# Manual smoke test: log in with credentials from the environment and browse once.
def _run_smoke_test():
    """Log in and browse the "应读" list once, always releasing the browser.

    Credentials are read from the KB_TEST_USERNAME and KB_TEST_PASSWORD
    environment variables so that no account data lives in the source file.
    """
    print("Playwright自动化核心 - 测试")
    print("="*60)

    username = os.environ.get("KB_TEST_USERNAME")
    password = os.environ.get("KB_TEST_PASSWORD")
    if not username or not password:
        print("请先设置环境变量 KB_TEST_USERNAME 和 KB_TEST_PASSWORD")
        return

    # The manager needs no explicit initialise/close step: each automation
    # object gets its own browser in login() and releases it in close().
    manager = PlaywrightBrowserManager(headless=True)
    automation = PlaywrightAutomation(manager, "test_account")
    try:
        if automation.login(username, password):
            result = automation.browse_content(
                browse_type="应读",
                auto_next_page=True,
                auto_view_attachments=True,
                interval=2.0  # longer viewing interval for the manual test
            )
            print(f"\n浏览结果: {result}")
    finally:
        # Runs after a failed login too, so the browser is not leaked.
        automation.close()

    print("="*60)
    print("测试完成")


if __name__ == "__main__":
    _run_smoke_test()