#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
API browser - browsing implemented with plain HTTP requests.

Roughly 30-60x faster than Playwright.
"""

import requests
from bs4 import BeautifulSoup
import re
import time
import atexit
import weakref
from typing import Optional, Callable
from dataclasses import dataclass
from urllib.parse import urlsplit

from app_config import get_config

config = get_config()

BASE_URL = getattr(config, "ZSGL_BASE_URL", "https://postoa.aidunsoft.com")
LOGIN_URL = getattr(config, "ZSGL_LOGIN_URL", f"{BASE_URL}/admin/login.aspx")
INDEX_URL_PATTERN = getattr(config, "ZSGL_INDEX_URL_PATTERN", "index.aspx")
COOKIES_DIR = getattr(config, "COOKIES_DIR", "data/cookies")

# Fallback domain for cookies that carry no domain of their own.
_cookie_domain_fallback = urlsplit(BASE_URL).hostname or "postoa.aidunsoft.com"

# Weak references, so tracking instances for cleanup never blocks GC.
_api_browser_instances: "weakref.WeakSet[APIBrowser]" = weakref.WeakSet()


def _cleanup_api_browser_instances():
    """Clean up leftover APIBrowser instances at process exit (weak refs, GC-safe)."""
    for inst in list(_api_browser_instances):
        try:
            inst.close()
        except Exception:
            pass


atexit.register(_cleanup_api_browser_instances)


@dataclass
class APIBrowseResult:
    """Result of an API browsing run."""
    success: bool
    total_items: int = 0
    total_attachments: int = 0
    error_message: str = ""


class APIBrowser:
    """API browser - drives the site with plain HTTP requests."""

    def __init__(self, log_callback: Optional[Callable] = None, proxy_config: Optional[dict] = None):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        })
        self.logged_in = False
        self.log_callback = log_callback
        self.stop_flag = False
        self._closed = False  # guards against double close()
        self.last_total_records = 0

        # Set up the proxy, if one was supplied.
        if proxy_config and proxy_config.get("server"):
            proxy_server = proxy_config["server"]
            self.session.proxies = {
                "http": proxy_server,
                "https": proxy_server
            }
            self.proxy_server = proxy_server
        else:
            self.proxy_server = None

        _api_browser_instances.add(self)

    def log(self, message: str):
        """Emit a log message via the callback, if one is registered."""
        if self.log_callback:
            self.log_callback(message)

    def save_cookies_for_playwright(self, username: str):
        """Save this session's cookies for reuse by Playwright."""
        import os
        import json
        import hashlib

        os.makedirs(COOKIES_DIR, exist_ok=True)

        # Security fix: hash the filename with SHA-256 instead of MD5.
        filename = hashlib.sha256(username.encode()).hexdigest()[:32] + '.json'
        cookies_path = os.path.join(COOKIES_DIR, filename)

        try:
            # Collect the cookies held by the requests session.
            cookies_list = []
            for cookie in self.session.cookies:
                cookies_list.append({
                    'name': cookie.name,
                    'value': cookie.value,
                    'domain': cookie.domain or _cookie_domain_fallback,
                    'path': cookie.path or '/',
                })

            # Playwright storage_state format.
            storage_state = {
                'cookies': cookies_list,
                'origins': []
            }

            with open(cookies_path, 'w', encoding='utf-8') as f:
                json.dump(storage_state, f)

            self.log("[API] Cookies saved for screenshot use")
            return True
        except Exception as e:
            self.log(f"[API] Failed to save cookies: {e}")
            return False

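    # How a consumer might load the saved file (a minimal sketch, not part of
    # this module; the chromium/headless choices and the <hash> placeholder in
    # the path are illustrative assumptions):
    #
    #     from playwright.sync_api import sync_playwright
    #
    #     with sync_playwright() as p:
    #         browser = p.chromium.launch(headless=True)
    #         # Seed the context with the cookies written above.
    #         context = browser.new_context(storage_state="data/cookies/<hash>.json")
    #         page = context.new_page()
    #         page.goto(f"{BASE_URL}/admin/index.aspx")
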
    def _request_with_retry(self, method, url, max_retries=3, retry_delay=1, **kwargs):
        """Issue a request, retrying on failure up to max_retries times."""
        kwargs.setdefault('timeout', 5)
        last_error = None

        for attempt in range(1, max_retries + 1):
            try:
                if method.lower() == 'get':
                    resp = self.session.get(url, **kwargs)
                else:
                    resp = self.session.post(url, **kwargs)
                return resp
            except Exception as e:
                last_error = e
                if attempt < max_retries:
                    self.log(f"[API] Request failed, retrying in {retry_delay}s ({attempt}/{max_retries})...")
                    time.sleep(retry_delay)
                else:
                    self.log(f"[API] Request failed after {max_retries} attempts: {str(e)}")

        raise last_error

    def _get_aspnet_fields(self, soup):
        """Collect the hidden ASP.NET WebForms postback fields from a page."""
        fields = {}
        for name in ['__VIEWSTATE', '__VIEWSTATEGENERATOR', '__EVENTVALIDATION']:
            field = soup.find('input', {'name': name})
            if field:
                fields[name] = field.get('value', '')
        return fields

    def get_real_name(self) -> Optional[str]:
        """
        Fetch the user's real name.

        Parses the name out of the center.aspx page.
        Returns the name string, or None on failure.
        """
        if not self.logged_in:
            return None

        try:
            url = f"{BASE_URL}/admin/center.aspx"
            resp = self._request_with_retry('get', url)
            soup = BeautifulSoup(resp.text, 'html.parser')

            # Look for the element containing "姓名:" (name).
            # Page format: <li><p>姓名:XXX(mobile) 人力资源编码: ...</p></li>
            nlist = soup.find('div', {'class': 'nlist-5'})
            if nlist:
                first_li = nlist.find('li')
                if first_li:
                    text = first_li.get_text()
                    # The name appears as "姓名:XXX(mobile)"; capture up to the paren.
                    match = re.search(r'姓名[::]\s*([^\((]+)', text)
                    if match:
                        real_name = match.group(1).strip()
                        if real_name:
                            return real_name
            return None
        except Exception:
            return None

    def login(self, username: str, password: str) -> bool:
        """Log in to the admin site."""
        self.log(f"[API] Logging in: {username}")

        try:
            resp = self._request_with_retry('get', LOGIN_URL)

            soup = BeautifulSoup(resp.text, 'html.parser')
            fields = self._get_aspnet_fields(soup)

            data = fields.copy()
            data['txtUserName'] = username
            data['txtPassword'] = password
            data['btnSubmit'] = '登 录'  # literal button text expected by the server

            resp = self._request_with_retry(
                'post',
                LOGIN_URL,
                data=data,
                headers={
                    'Content-Type': 'application/x-www-form-urlencoded',
                    'Origin': BASE_URL,
                    'Referer': LOGIN_URL,
                },
                allow_redirects=True
            )

            # A successful login redirects to the index page.
            if INDEX_URL_PATTERN in resp.url:
                self.logged_in = True
                self.log("[API] Login succeeded")
                return True
            else:
                soup = BeautifulSoup(resp.text, 'html.parser')
                error = soup.find(id='lblMsg')
                error_msg = error.get_text().strip() if error else 'unknown error'
                self.log(f"[API] Login failed: {error_msg}")
                return False

        except Exception as e:
            self.log(f"[API] Login error: {str(e)}")
            return False

    def get_article_list_page(self, bz: int = 2, page: int = 1, base_url: Optional[str] = None):
        """Fetch one page of the article list; returns (articles, total_pages, next_page_url)."""
        if not self.logged_in:
            return [], 0, None

        if base_url and page > 1:
            url = re.sub(r'page=\d+', f'page={page}', base_url)
        elif page > 1:
            # Compatibility fallback: if there is no next_url (rarely, the page
            # offers no "next page" link), build the page parameter directly.
            url = f"{BASE_URL}/admin/center.aspx?bz={bz}&page={page}"
        else:
            url = f"{BASE_URL}/admin/center.aspx?bz={bz}"

        resp = self._request_with_retry('get', url)
        soup = BeautifulSoup(resp.text, 'html.parser')
        articles = []

        ltable = soup.find('table', {'class': 'ltable'})
        if ltable:
            rows = ltable.find_all('tr')[1:]
            for row in rows:
                # Skip the "no records" (暂无记录) placeholder row.
                if '暂无记录' in row.get_text():
                    continue

                link = row.find('a', href=True)
                if link:
                    href = link.get('href', '')
                    title = link.get_text().strip()

                    match = re.search(r'id=(\d+)', href)
                    article_id = match.group(1) if match else None

                    articles.append({
                        'title': title,
                        'href': href,
                        'article_id': article_id,
                    })

        # Work out the total page count.
        total_pages = 1
        next_page_url = None
        total_records = 0

        page_content = soup.find(id='PageContent')
        if page_content:
            text = page_content.get_text()
            total_match = re.search(r'共(\d+)记录', text)  # "N records in total"
            if total_match:
                total_records = int(total_match.group(1))
                total_pages = (total_records + 9) // 10  # ceiling division, 10 records per page

            next_link = page_content.find('a', string=re.compile('下一页'))  # "next page" link
            if next_link:
                next_href = next_link.get('href', '')
                if next_href:
                    next_page_url = f"{BASE_URL}/admin/{next_href}"

        try:
            self.last_total_records = int(total_records or 0)
        except Exception:
            self.last_total_records = 0
        return articles, total_pages, next_page_url

    def get_article_attachments(self, article_href: str):
        """Fetch the list of attachments on an article page."""
        if not article_href.startswith('http'):
            url = f"{BASE_URL}/admin/{article_href}"
        else:
            url = article_href

        resp = self._request_with_retry('get', url)
        soup = BeautifulSoup(resp.text, 'html.parser')

        attachments = []

        attach_list = soup.find('div', {'class': 'attach-list2'})
        if attach_list:
            items = attach_list.find_all('li')
            for item in items:
                download_links = item.find_all('a', onclick=re.compile(r'download\.ashx'))
                for link in download_links:
                    onclick = link.get('onclick', '')
                    id_match = re.search(r'id=(\d+)', onclick)
                    channel_match = re.search(r'channel_id=(\d+)', onclick)
                    if id_match:
                        attach_id = id_match.group(1)
                        channel_id = channel_match.group(1) if channel_match else '1'
                        h3 = item.find('h3')
                        filename = h3.get_text().strip() if h3 else f'附件{attach_id}'
                        attachments.append({
                            'id': attach_id,
                            'channel_id': channel_id,
                            'filename': filename
                        })
                        break  # one download link per attachment item is enough

        return attachments

    def mark_read(self, attach_id: str, channel_id: str = '1') -> bool:
        """Mark an attachment as read by hitting its download URL."""
        download_url = f"{BASE_URL}/tools/download.ashx?site=main&id={attach_id}&channel_id={channel_id}"

        try:
            resp = self._request_with_retry("get", download_url, stream=True)
            resp.close()  # stream=True: close without downloading the body
            return resp.status_code == 200
        except Exception:
            return False

    def browse_content(
        self,
        browse_type: str,
        should_stop_callback: Optional[Callable] = None,
        progress_callback: Optional[Callable] = None,
    ) -> APIBrowseResult:
        """
        Browse content and mark it as read.

        Args:
            browse_type: browse category ("应读" = required reading /
                "注册前未读" = unread before registration)
            should_stop_callback: callback that returns True when browsing should stop
            progress_callback: optional callback for reporting the live browse count

        Returns:
            The browse result.
        """
        result = APIBrowseResult(success=False)

        if not self.logged_in:
            result.error_message = "not logged in"
            return result

        # Map the browse type onto the site's bz parameter.
        # Actual values used by the page: 0 = unread before registration,
        # 2 = required reading (1 = already read existed historically but is no longer used).
        # Current frontend options: 注册前未读 / 应读 (default: 应读).
        browse_type_text = str(browse_type or "")
        if '注册前' in browse_type_text:
            bz = 0  # unread before registration
        else:
            bz = 2  # required reading

        self.log(f"[API] Start browsing '{browse_type}' (bz={bz})...")

        try:
            total_items = 0
            total_attachments = 0
            page = 1
            base_url = None
            skipped_items = 0
            skipped_pages = 0
            consecutive_failures = 0
            max_consecutive_failures = 3

            # Fetch the first page.
            try:
                articles, total_pages, next_url = self.get_article_list_page(bz, page)
                consecutive_failures = 0
            except Exception as e:
                result.error_message = str(e)
                self.log(f"[API] Failed to fetch page 1 of the list: {str(e)}")
                return result

            if not articles:
                self.log(f"[API] '{browse_type}' has nothing to process")
                result.success = True
                return result

            self.log(f"[API] {total_pages} page(s) in total, processing...")

            if next_url:
                base_url = next_url
            elif total_pages > 1:
                base_url = f"{BASE_URL}/admin/center.aspx?bz={bz}&page=2"

            total_records = int(getattr(self, "last_total_records", 0) or 0)
            last_report_ts = 0.0

            def report_progress(force: bool = False):
                # Throttled progress reporting: at most once per second unless forced.
                nonlocal last_report_ts
                if not progress_callback:
                    return
                now_ts = time.time()
                if not force and now_ts - last_report_ts < 1.0:
                    return
                last_report_ts = now_ts
                try:
                    progress_callback({"total_items": total_records, "browsed_items": total_items})
                except Exception:
                    pass

            report_progress(force=True)

            # Process every page.
            while page <= total_pages:
                if should_stop_callback and should_stop_callback():
                    self.log("[API] Stop signal received")
                    break

                # Page 1 was fetched above; later pages are fetched here.
                if page > 1:
                    try:
                        articles, _, next_url = self.get_article_list_page(bz, page, base_url)
                        consecutive_failures = 0
                        if next_url:
                            base_url = next_url
                    except Exception as e:
                        skipped_pages += 1
                        consecutive_failures += 1
                        self.log(
                            f"[API] Failed to fetch list page {page}, skipping it (consecutive failures {consecutive_failures}/{max_consecutive_failures}): {str(e)}"
                        )
                        if consecutive_failures >= max_consecutive_failures:
                            raise
                        page += 1
                        continue

                for article in articles:
                    if should_stop_callback and should_stop_callback():
                        break

                    title = article['title'][:30]
                    # Fetch the attachments (article detail page).
                    try:
                        attachments = self.get_article_attachments(article['href'])
                        consecutive_failures = 0
                    except Exception as e:
                        skipped_items += 1
                        consecutive_failures += 1
                        self.log(
                            f"[API] Failed to fetch article, skipping (consecutive failures {consecutive_failures}/{max_consecutive_failures}): {title} | {str(e)}"
                        )
                        if consecutive_failures >= max_consecutive_failures:
                            raise
                        continue

                    total_items += 1
                    report_progress()

                    if attachments:
                        for attach in attachments:
                            if self.mark_read(attach['id'], attach['channel_id']):
                                total_attachments += 1

                    self.log(f"[API] [{total_items}] {title} - {len(attachments)} attachment(s)")

                    time.sleep(0.1)

                page += 1
                time.sleep(0.2)

            report_progress(force=True)
            if skipped_items or skipped_pages:
                self.log(
                    f"[API] Browsing done: {total_items} item(s), {total_attachments} attachment(s) (skipped {skipped_items} item(s), {skipped_pages} list page(s))"
                )
            else:
                self.log(f"[API] Browsing done: {total_items} item(s), {total_attachments} attachment(s)")

            result.success = True
            result.total_items = total_items
            result.total_attachments = total_attachments
            return result

        except Exception as e:
            result.error_message = str(e)
            self.log(f"[API] Browsing error: {str(e)}")
            return result

    def close(self):
        """Close the session."""
        if self._closed:
            return
        self._closed = True

        try:
            self.session.close()
        except Exception:
            pass
        finally:
            try:
                _api_browser_instances.discard(self)
            except Exception:
                pass

    def __enter__(self):
        """Context manager support - enter."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager support - exit."""
        self.close()
        return False  # do not suppress exceptions
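

if __name__ == "__main__":
    # A minimal usage sketch. The credentials and the print-based logging below
    # are placeholders (assumptions for illustration), not values from this project.
    with APIBrowser(log_callback=print) as browser:
        if browser.login("your_username", "your_password"):
            result = browser.browse_content("应读")
            print(f"ok={result.success}, items={result.total_items}, "
                  f"attachments={result.total_attachments}")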