# zsglpt/playwright_automation.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Playwright版本 - 知识管理系统自动化核心
使用浏览器上下文(Context)实现高性能并发
"""

import os
from pathlib import Path
from playwright.sync_api import sync_playwright, Browser, BrowserContext, Page, Playwright
import time
import threading
from typing import Optional, Callable
from dataclasses import dataclass

# Pin the Playwright browser install directory (works around the Nuitka
# onefile temporary-directory problem). The variable is set at import time.
BROWSERS_PATH = str(Path.home().joinpath("AppData", "Local", "ms-playwright"))
os.environ["PLAYWRIGHT_BROWSERS_PATH"] = BROWSERS_PATH

# Configuration constants
class Config:
    """Static settings used by the automation classes in this module."""

    # Page that hosts the login form.
    LOGIN_URL = "https://postoa.aidunsoft.com/admin/login.aspx"
    # Substring searched for in the post-login URL to decide whether login worked.
    INDEX_URL_PATTERN = "index.aspx"

    # Navigation timeout in milliseconds (raised from 30s to 60s for multi-account runs).
    PAGE_LOAD_TIMEOUT = 60_000
    # Default operation timeout in milliseconds (raised to cope with concurrent accounts).
    DEFAULT_TIMEOUT = 60_000

    # Maximum number of concurrent contexts.
    MAX_CONCURRENT_CONTEXTS = 100


@dataclass
class BrowseResult:
    """Outcome of one browse run.

    The field order and defaults define the generated constructor, so they
    are part of the interface.
    """
    # Whether the browse run finished without an unhandled error.
    success: bool
    # Number of content rows that were processed.
    total_items: int = 0
    # Number of attachments that were processed.
    total_attachments: int = 0
    # Description of the failure; empty when none was recorded.
    error_message: str = ""


class PlaywrightBrowserManager:
    """Launches isolated Playwright browsers, one per account.

    Every ``create_browser*`` call starts its own Playwright driver and its
    own Chromium process. The manager keeps no reference to what it creates:
    the caller owns the returned objects and must close them.
    """

    def __init__(self, headless: bool = True, log_callback: Optional[Callable] = None):
        """
        Store the launch preferences.

        Args:
            headless: Launch Chromium in headless mode when true.
            log_callback: Optional logger, called as
                ``log_callback(message, account_id)``.
        """
        self.headless = headless
        self.log_callback = log_callback
        # No method of this class acquires this lock.
        self._lock = threading.Lock()

    def log(self, message: str, account_id: Optional[str] = None):
        """Pass ``message`` and ``account_id`` to the log callback, if one was given."""
        if self.log_callback:
            self.log_callback(message, account_id)

    def _release_quietly(self, *closers):
        """Call each cleanup callable in order, logging (not raising) any failure."""
        for closer in closers:
            try:
                closer()
            except Exception as e:
                self.log(f"清理资源时出错: {str(e)}")

    def create_browser(self, proxy_config=None):
        """
        Start a Playwright driver and launch a dedicated Chromium process.

        Args:
            proxy_config: Optional dict; when it has a truthy ``'server'`` entry
                that value is passed to Chromium as the proxy server.

        Returns:
            A ``(playwright, browser)`` tuple.

        Raises:
            Exception: Whatever starting the driver or launching the browser
                raised, re-raised after it has been logged. A driver that was
                already started is stopped first so it is not leaked.
        """
        playwright = None
        try:
            self.log("初始化Playwright实例...")
            playwright = sync_playwright().start()

            self.log("启动独立浏览器进程...")
            start_time = time.time()

            # Browser launch options
            launch_options = {
                'headless': self.headless,
                'args': [
                    '--no-sandbox',
                    '--disable-dev-shm-usage',
                    '--disable-gpu',
                    '--disable-extensions',
                    '--disable-notifications',
                    '--disable-infobars',
                    '--disable-default-apps',
                    '--disable-background-timer-throttling',
                    '--disable-backgrounding-occluded-windows',
                    '--disable-renderer-backgrounding',
                ]
            }

            # Only the proxy server address is forwarded.
            if proxy_config and proxy_config.get('server'):
                launch_options['proxy'] = {
                    'server': proxy_config['server']
                }
                self.log(f"使用代理: {proxy_config['server']}")

            browser = playwright.chromium.launch(**launch_options)

            elapsed = time.time() - start_time
            self.log(f"独立浏览器启动成功 (耗时: {elapsed:.2f}秒)")

            return playwright, browser

        except Exception as e:
            self.log(f"启动浏览器失败: {str(e)}")
            # The caller never receives the driver on failure, so it has to be
            # stopped here or it would be left running.
            if playwright is not None:
                self._release_quietly(playwright.stop)
            raise

    def create_browser_and_context(self, proxy_config=None):
        """
        Launch a dedicated browser and open one configured context in it.

        Args:
            proxy_config: Optional proxy settings, see ``create_browser``.

        Returns:
            A ``(playwright, browser, context)`` tuple.

        Raises:
            Exception: Whatever ``create_browser`` or the context setup raised.
                If the context setup fails, the browser and the driver created
                for this call are closed before the error is re-raised.
        """
        playwright, browser = self.create_browser(proxy_config)

        try:
            start_time = time.time()
            self.log("创建浏览器上下文...")

            context = browser.new_context(
                viewport={'width': 1920, 'height': 1080},
                user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
                device_scale_factor=2,  # 2x device pixel ratio for sharper text
            )

            # Default timeouts for this context
            context.set_default_timeout(Config.DEFAULT_TIMEOUT)
            context.set_default_navigation_timeout(Config.PAGE_LOAD_TIMEOUT)

            elapsed = time.time() - start_time
            self.log(f"上下文创建完成 (耗时: {elapsed:.3f}秒)")
        except Exception as e:
            self.log(f"创建浏览器上下文失败: {str(e)}")
            # Nothing is returned on failure, so release what was launched.
            self._release_quietly(browser.close, playwright.stop)
            raise

        return playwright, browser, context


class PlaywrightAutomation:
    """Playwright自动化操作类"""

    def __init__(self, browser_manager: PlaywrightBrowserManager, account_id: str, proxy_config: Optional[dict] = None):
        """
        初始化自动化操作

        Args:
            browser_manager: 浏览器管理器
            account_id: 账号ID（用于日志）
        """
        self.browser_manager = browser_manager
        self.account_id = account_id
        self.proxy_config = proxy_config
        self.playwright: Optional[Playwright] = None
        self.browser: Optional[Browser] = None
        self.context: Optional[BrowserContext] = None
        self.page: Optional[Page] = None
        self.main_page: Optional[Page] = None

    def log(self, message: str):
        """记录日志"""
        self.browser_manager.log(message, self.account_id)

    def login(self, username: str, password: str, remember: bool = True) -> bool:
        """
        登录系统

        Args:
            username: 用户名
            password: 密码
            remember: 是否记住密码

        Returns:
            是否登录成功
        """
        try:
            self.log("创建浏览器上下文...")
            start_time = time.time()
            self.playwright, self.browser, self.context = self.browser_manager.create_browser_and_context(self.proxy_config)
            elapsed = time.time() - start_time
            self.log(f"浏览器和上下文创建完成 (耗时: {elapsed:.3f}秒)")

            self.log("创建页面...")
            self.page = self.context.new_page()
            self.main_page = self.page

            self.log("访问登录页面...")
            # 使用重试机制处理超时
            max_retries = 2
            for attempt in range(max_retries):
                try:
                    self.page.goto(Config.LOGIN_URL, timeout=60000)
                    break
                except Exception as e:
                    if attempt < max_retries - 1:
                        self.log(f"页面加载超时，重试中... ({attempt + 1}/{max_retries})")
                        time.sleep(2)
                    else:
                        raise

            self.log("填写登录信息...")
            self.page.fill('#txtUserName', username)
            self.page.fill('#txtPassword', password)

            if remember:
                self.page.check('#chkRemember')

            self.log("点击登录按钮...")
            self.page.click('#btnSubmit')

            # 等待跳转
            self.log("等待登录处理...")
            self.page.wait_for_load_state('networkidle', timeout=30000)  # 增加到30秒

            # 检查登录结果
            current_url = self.page.url
            self.log(f"当前URL: {current_url}")

            if Config.INDEX_URL_PATTERN in current_url:
                self.log("登录成功!")
                return True
            else:
                self.log("登录失败，请检查用户名和密码")
                return False

        except Exception as e:
            self.log(f"登录过程中出错: {str(e)}")
            return False

    def switch_to_iframe(self) -> bool:
        """切换到mainframe iframe"""
        try:
            self.log("查找并切换到iframe...")

            # 使用Playwright的等待机制
            max_retries = 3
            for i in range(max_retries):
                try:
                    # 等待iframe元素出现
                    self.main_page.wait_for_selector("iframe[name='mainframe']", timeout=2000)

                    # 获取iframe
                    iframe = self.main_page.frame('mainframe')
                    if iframe:
                        self.page = iframe
                        self.log(f"✓ 成功切换到iframe (尝试 {i+1}/{max_retries})")
                        return True
                except Exception as e:
                    if i < max_retries - 1:
                        self.log(f"未找到iframe，重试中... ({i+1}/{max_retries})")
                        time.sleep(1)
                    else:
                        self.log(f"所有重试都失败，未找到iframe")

            return False

        except Exception as e:
            self.log(f"切换到iframe时出错: {str(e)}")
            return False

    def switch_browse_type(self, browse_type: str, max_retries: int = 2) -> bool:
        """
        切换浏览类型(带重试机制)

        Args:
            browse_type: 浏览类型（注册前未读/应读/已读）
            max_retries: 最大重试次数(默认2次)

        Returns:
            是否切换成功
        """
        for attempt in range(max_retries + 1):
            try:
                if attempt > 0:
                    self.log(f"⚠ 第 {attempt + 1} 次尝试切换浏览类型...")
                else:
                    self.log(f"切换到'{browse_type}'类型...")

                # 切换到iframe
                if not self.switch_to_iframe():
                    if attempt < max_retries:
                        self.log(f"iframe切换失败,等待1秒后重试...")
                        time.sleep(1)
                        continue
                    return False

                # 方法1: 尝试查找<a>标签（如果JavaScript创建了的话）
                selector = f"//div[contains(@class, 'rule-multi-radio')]//a[contains(text(), '{browse_type}')]"

                try:
                    # 等待并点击
                    self.page.locator(selector).click(timeout=5000)
                    self.log(f"点击'{browse_type}'按钮成功")

                    # 等待页面刷新并加载内容
                    time.sleep(1.5)

                    # 等待表格加载（最多等待30秒）
                    try:
                        self.page.locator("//table[@class='ltable']").wait_for(timeout=30000)
                        self.log("内容表格已加载")
                    except Exception as e:
                        self.log("等待表格加载超时，继续...")

                    return True
                except Exception as e:
                    error_msg = str(e)
                    if "Execution context was destroyed" in error_msg:
                        self.log(f"⚠ 检测到执行上下文被销毁")
                        if attempt < max_retries:
                            self.log(f"等待2秒后重试...")
                            time.sleep(2)
                            continue
                    self.log(f"未找到<a>标签，尝试点击<label>...")

                # 方法2: 点击label（模拟点击radio button）
                label_selector = f"//label[contains(text(), '{browse_type}')]"

                try:
                    self.page.locator(label_selector).click(timeout=5000)
                    self.log(f"点击'{browse_type}'标签成功")

                    # 等待页面刷新并加载内容
                    time.sleep(1.5)

                    # 等待表格加载（最多等待30秒）
                    try:
                        self.page.locator("//table[@class='ltable']").wait_for(timeout=30000)
                        self.log("内容表格已加载")
                    except Exception as e:
                        self.log("等待表格加载超时，继续...")

                    return True
                except Exception as e:
                    error_msg = str(e)
                    if "Execution context was destroyed" in error_msg:
                        self.log(f"⚠ 检测到执行上下文被销毁")
                        if attempt < max_retries:
                            self.log(f"等待2秒后重试...")
                            time.sleep(2)
                            continue
                    self.log(f"未找到<label>标签")

                # 如果两种方法都失败，但还有重试机会
                if attempt < max_retries:
                    self.log(f"切换失败,等待2秒后重试...")
                    time.sleep(2)
                    continue

                return False

            except Exception as e:
                error_msg = str(e)
                self.log(f"切换浏览类型时出错: {error_msg}")

                # 检查是否是 "Execution context was destroyed" 错误
                if "Execution context was destroyed" in error_msg or "navigation" in error_msg.lower():
                    if attempt < max_retries:
                        self.log(f"⚠ 检测到执行上下文被销毁或导航错误,等待2秒后重试...")
                        time.sleep(2)
                        continue

                return False

        # 所有重试都失败
        self.log(f"❌ 切换浏览类型失败,已重试 {max_retries} 次")
        return False

    def browse_content(self, browse_type: str,
                      auto_next_page: bool = True,
                      auto_view_attachments: bool = True,
                      interval: float = 1.0,
                      should_stop_callback: Optional[Callable] = None) -> BrowseResult:
        """
        浏览内容

        Args:
            browse_type: 浏览类型
            auto_next_page: 是否自动翻页
            auto_view_attachments: 是否自动查看附件
            interval: 查看附件的间隔时间(秒)
            should_stop_callback: 检查是否应该停止的回调函数

        Returns:
            浏览结果
        """
        result = BrowseResult(success=False)

        try:
            # 先导航到浏览页面
            self.log(f"导航到 '{browse_type}' 页面...")
            try:
                # 等待页面完全加载
                time.sleep(2)
                self.log(f"当前URL: {self.main_page.url}")
            except Exception as e:
                self.log(f"获取URL失败: {str(e)}")

            # 切换浏览类型
            if not self.switch_browse_type(browse_type):
                result.error_message = "切换浏览类型失败"
                return result

            current_page = 1
            total_items = 0
            total_attachments = 0
            completed_first_round = False
            empty_page_counter = 0

            while True:
                # 检查是否应该停止
                if should_stop_callback and should_stop_callback():
                    self.log("收到停止信号，终止浏览")
                    break

                self.log(f"处理第 {current_page} 页...")

                # 确保在iframe中(关键!)
                time.sleep(0.2)
                self.page = self.main_page.frame('mainframe')
                if not self.page:
                    self.log("错误：无法获取iframe")
                    break

                # 额外等待，确保AJAX内容加载完成
                time.sleep(0.5)

                # 获取内容行数量
                rows_locator = self.page.locator("//table[@class='ltable']/tbody/tr[position()>1 and count(td)>=5]")
                rows_count = rows_locator.count()

                if rows_count == 0:
                    self.log("当前页面没有内容")
                    empty_page_counter += 1
                    self.log(f"连续空页面数: {empty_page_counter}")

                    # 检查是否已完成至少一轮浏览且连续空页面数达到阈值
                    if completed_first_round and empty_page_counter >= 2:
                        self.log("检测到连续空页面且已完成至少一轮浏览，内容已浏览完毕")
                        break

                    # 尝试翻页或返回第一页
                    if auto_next_page:
                        # 检查是否有下一页
                        try:
                            next_button = self.page.locator("//div[@id='PageContent']/a[contains(text(), '下一页') or contains(text(), '»')]")
                            if next_button.count() > 0:
                                self.log("点击下一页...")
                                next_button.click()
                                time.sleep(1.5)
                                current_page += 1
                                continue
                            else:
                                # 没有下一页，返回第一页
                                if not completed_first_round:
                                    completed_first_round = True
                                    self.log("完成第一轮浏览，准备返回第一页继续浏览...")
                                else:
                                    self.log("完成一轮浏览，返回第一页继续...")

                                # 刷新页面并重新点击浏览类型
                                self.log("刷新页面并重新点击浏览类型...")
                                self.main_page.reload()
                                time.sleep(1.5)

                                # 切换到iframe
                                time.sleep(0.5)
                                self.page = self.main_page.frame('mainframe')

                                # 重新点击浏览类型按钮
                                selector = f"//div[contains(@class, 'rule-multi-radio')]//a[contains(text(), '{browse_type}')]"
                                try:
                                    self.page.locator(selector).click(timeout=5000)
                                    self.log(f"重新点击'{browse_type}'按钮成功")
                                    time.sleep(1.5)

                                    # 等待表格加载
                                    try:
                                        self.page.locator("//table[@class='ltable']").wait_for(timeout=30000)  # 增加到30秒
                                        self.log("内容表格已加载")
                                    except Exception as e:
                                        self.log("等待表格加载超时，继续...")
                                except Exception as e:
                                    # 尝试点击label
                                    label_selector = f"//label[contains(text(), '{browse_type}')]"
                                    self.page.locator(label_selector).click(timeout=5000)
                                    self.log(f"点击'{browse_type}'标签成功")
                                    time.sleep(1.5)

                                current_page = 1
                                continue
                        except Exception as e:
                            self.log(f"翻页时出错: {str(e)}")
                            break
                    else:
                        break

                # 找到内容，重置空页面计数
                empty_page_counter = 0
                self.log(f"找到 {rows_count} 条内容")

                # 处理每一行 (每次从头重新获取所有行)
                for i in range(rows_count):
                    if should_stop_callback and should_stop_callback():
                        break

                    # 每次处理新行前,确保在iframe中(关键!尤其是history.back()后)
                    if i > 0:
                        time.sleep(0.2)
                        self.page = self.main_page.frame('mainframe')

                    # 每次都重新获取rows_locator和row,确保元素是最新的
                    current_rows_locator = self.page.locator("//table[@class='ltable']/tbody/tr[position()>1 and count(td)>=5]")
                    row = current_rows_locator.nth(i)

                    # 获取标题 (使用xpath:)
                    title_cell = row.locator("xpath=.//td[4]")
                    title = title_cell.inner_text().strip()
                    self.log(f"  [{i+1}] {title[:50]}")
                    total_items += 1

                    # 处理附件 (使用xpath:)
                    if auto_view_attachments:
                        # 每次都重新获取附件链接数量
                        att_links_locator = row.locator("xpath=.//td[5]//a[contains(@class, 'link-btn')]")
                        att_count = att_links_locator.count()

                        if att_count > 0:
                            # 只处理第一个附件
                            att_link = att_links_locator.first
                            att_text = att_link.inner_text().strip() or "附件"
                            self.log(f"    - 处理{att_text}...")

                            try:
                                # 记录点击前的页面数量
                                pages_before = len(self.context.pages)

                                # 点击附件
                                att_link.click()

                                # 快速检测是否有新窗口（0.5秒足够）
                                time.sleep(0.5)

                                # 检查是否有新窗口
                                pages_after = self.context.pages
                                if len(pages_after) > pages_before:
                                    # 有新窗口打开
                                    new_page = pages_after[-1]
                                    self.log(f"    - 新窗口已打开，等待加载...")
                                    time.sleep(interval)  # 使用用户设置的间隔

                                    # 关闭新窗口
                                    new_page.close()
                                    self.log(f"    - 新窗口已关闭")
                                else:
                                    # 没有新窗口，使用浏览器返回（像Selenium版本一样）
                                    # 关键问题：iframe内点击附件不会触发真正的导航
                                    # Selenium的driver.back()不等待，Playwright的go_back()会等待导航
                                    # 解决方案：使用JavaScript执行history.back()，不等待导航
                                    self.main_page.evaluate("() => window.history.back()")
                                    time.sleep(0.5)

                                    # 确保回到iframe中
                                    self.page = self.main_page.frame('mainframe')

                                # 确保回到iframe中
                                time.sleep(0.2)
                                self.page = self.main_page.frame('mainframe')

                                total_attachments += 1
                                self.log(f"    - {att_text}处理完成")

                            except Exception as e:
                                self.log(f"    - 处理{att_text}时出错: {str(e)}")
                                # 发生错误时尝试恢复到iframe
                                try:
                                    # 尝试重新获取iframe
                                    iframe = self.main_page.frame('mainframe')
                                    if iframe:
                                        self.page = iframe
                                    else:
                                        # 如果找不到iframe，可能需要刷新
                                        self.log(f"    - 找不到iframe，刷新页面...")
                                        self.main_page.reload()
                                        time.sleep(1)
                                        if self.switch_browse_type(browse_type):
                                            self.page = self.main_page.frame('mainframe')
                                except Exception as e:
                                    pass

                # 处理完当前页后，检查是否需要翻页
                if auto_next_page:
                    try:
                        # 确保在iframe中
                        time.sleep(0.2)
                        self.page = self.main_page.frame('mainframe')

                        # 检查是否有下一页
                        next_button = self.page.locator("//div[@id='PageContent']/a[contains(text(), '下一页') or contains(text(), '»')]")
                        if next_button.count() > 0:
                            self.log("点击下一页...")
                            next_button.click()
                            time.sleep(1.5)
                            current_page += 1
                            # 继续下一页的循环
                        else:
                            # 没有下一页了，返回第一页继续
                            if not completed_first_round:
                                completed_first_round = True
                                self.log("完成第一轮浏览，准备返回第一页继续浏览...")
                            else:
                                self.log("完成一轮浏览，返回第一页继续...")

                            # 刷新页面并重新点击浏览类型
                            self.log("刷新页面并重新点击浏览类型...")
                            self.main_page.reload()
                            time.sleep(1.5)

                            # 切换到iframe
                            time.sleep(0.5)
                            self.page = self.main_page.frame('mainframe')

                            # 重新点击浏览类型按钮
                            selector = f"//div[contains(@class, 'rule-multi-radio')]//a[contains(text(), '{browse_type}')]"
                            try:
                                self.page.locator(selector).click(timeout=5000)
                                self.log(f"重新点击'{browse_type}'按钮成功")
                                time.sleep(1.5)

                                # 等待表格加载
                                try:
                                    self.page.locator("//table[@class='ltable']").wait_for(timeout=30000)  # 增加到30秒
                                    self.log("内容表格已加载")
                                except Exception as e:
                                    self.log("等待表格加载超时，继续...")
                            except Exception as e:
                                # 尝试点击label
                                label_selector = f"//label[contains(text(), '{browse_type}')]"
                                self.page.locator(label_selector).click(timeout=5000)
                                self.log(f"点击'{browse_type}'标签成功")
                                time.sleep(1.5)

                            current_page = 1
                            # 继续循环，从第一页开始
                    except Exception as e:
                        self.log(f"翻页时出错: {str(e)}")
                        break

            result.success = True
            result.total_items = total_items
            result.total_attachments = total_attachments
            self.log(f"浏览完成！共 {total_items} 条内容，{total_attachments} 个附件")

        except Exception as e:
            result.error_message = str(e)
            self.log(f"浏览内容时出错: {str(e)}")

        return result

    def take_screenshot(self, filepath: str) -> bool:
        """
        截图

        Args:
            filepath: 截图保存路径

        Returns:
            是否截图成功
        """
        try:
            # 使用最高质量设置截图
            # type='jpeg' 指定JPEG格式（支持quality参数）
            # quality=100 表示100%的JPEG质量（范围0-100，最高质量）
            # full_page=True 表示截取整个页面
            # 视口分辨率 2560x1440 确保高清晰度
            # 这样可以生成更清晰的截图，大小约500KB-1MB左右
            self.main_page.screenshot(
                path=filepath,
                type='jpeg',
                full_page=True,
                quality=100
            )
            self.log(f"截图已保存: {filepath}")
            return True
        except Exception as e:
            self.log(f"截图失败: {str(e)}")
            return False

    def close(self):
        """完全关闭浏览器进程(每个账号独立)并确保资源释放"""
        errors = []

        # 第一步:关闭上下文
        if self.context:
            try:
                self.context.close()
                self.log("上下文已关闭")
            except Exception as e:
                error_msg = f"关闭上下文时出错: {str(e)}"
                self.log(error_msg)
                errors.append(error_msg)

        # 第二步:关闭浏览器进程
        if self.browser:
            try:
                self.browser.close()
                self.log("浏览器进程已关闭")
            except Exception as e:
                error_msg = f"关闭浏览器时出错: {str(e)}"
                self.log(error_msg)
                errors.append(error_msg)

        # 第三步:停止Playwright
        if self.playwright:
            try:
                self.playwright.stop()
                self.log("Playwright已停止")
            except Exception as e:
                error_msg = f"停止Playwright时出错: {str(e)}"
                self.log(error_msg)
                errors.append(error_msg)

        # 第四步:清空引用,确保垃圾回收
        self.context = None
        self.page = None
        self.main_page = None
        self.browser = None
        self.playwright = None

        # 第五步:强制等待,确保进程完全退出
        time.sleep(0.5)

        if errors:
            self.log(f"资源清理完成,但有{len(errors)}个警告")
        else:
            self.log("资源清理完成")


# Manual smoke test: log in with one account and browse the "应读" list.
if __name__ == "__main__":
    print("Playwright自动化核心 - 测试")
    print("="*60)

    # Credentials are read from the environment so that no real account is
    # stored in the source file.
    test_username = os.environ.get("ZSGLPT_TEST_USERNAME", "")
    test_password = os.environ.get("ZSGLPT_TEST_PASSWORD", "")

    if not test_username or not test_password:
        print("请先设置环境变量 ZSGLPT_TEST_USERNAME 和 ZSGLPT_TEST_PASSWORD")
    else:
        # The manager holds no resources of its own (it has no initialize()
        # or close() method); each automation instance owns its browser.
        manager = PlaywrightBrowserManager(headless=True)
        automation = PlaywrightAutomation(manager, "test_account")

        try:
            if automation.login(test_username, test_password):
                result = automation.browse_content(
                    browse_type="应读",
                    auto_next_page=True,
                    auto_view_attachments=True,
                    interval=2.0  # longer pause per attachment
                )

                print(f"\n浏览结果: {result}")
        finally:
            # Always release the browser, even when login or browsing raised.
            automation.close()

    print("="*60)
    print("测试完成")