- Add dependency detection module: checks for wkhtmltoimage and Playwright Chromium at startup
- Add dependency install dialog: prompts the user to download and install missing dependencies in one click
- Fix option persistence: the browse type, auto-screenshot, and auto-upload options are now saved
- Improve KDocs login detection: when logged out, automatically switch to the KDocs page and show the QR code
- Simplify log output: drop debug noise, keep user-friendly status messages
- Add account-change signal: edits on the account management page now sync to the browse task page automatically

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
API browser - lite edition

Implements browsing with plain HTTP requests, roughly 30-60x faster than
browser automation.

Trimmed down from the original project; caching, diagnostic logging, and
other heavyweight features were removed.
"""

import os
import re
import time
import random
import hashlib
from typing import Optional, Callable, Dict
from dataclasses import dataclass
from urllib.parse import urlsplit

import requests
from bs4 import BeautifulSoup


@dataclass
class APIBrowseResult:
    """Result of an API browse run."""
    success: bool
    total_items: int = 0
    total_attachments: int = 0
    error_message: str = ""


def get_cookie_jar_path(username: str) -> str:
    """Return the path of the per-user cookies file used for screenshots (Netscape cookie format)."""
    from config import COOKIES_DIR

    COOKIES_DIR.mkdir(exist_ok=True)
    filename = hashlib.sha256(username.encode()).hexdigest()[:32] + ".cookies.txt"
    return str(COOKIES_DIR / filename)


def is_cookie_jar_fresh(cookie_path: str, max_age_seconds: int = 86400) -> bool:
    """Check that the cookies file exists and has not expired (default: 24 hours)."""
    if not cookie_path or not os.path.exists(cookie_path):
        return False
    try:
        file_age = time.time() - os.path.getmtime(cookie_path)
        return file_age <= max(0, int(max_age_seconds or 0))
    except Exception:
        return False

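# Usage sketch (illustrative, not part of the module): reuse a saved cookie
# jar only while it is fresh, otherwise log in again and re-export it. The
# credentials below are hypothetical placeholders.
#
#     jar = get_cookie_jar_path("alice")
#     if not is_cookie_jar_fresh(jar, max_age_seconds=6 * 3600):
#         with APIBrowser(log_callback=print) as browser:
#             if browser.login("alice", "secret"):
#                 browser.save_cookies_for_screenshot("alice")
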
class APIBrowser:
    """
    API browser - drives the site with plain HTTP requests.

    Usage:
        with APIBrowser(log_callback=print) as browser:
            if browser.login(username, password):
                result = browser.browse_content("应读")  # the site's "required reading" list
    """

    def __init__(self, log_callback: Optional[Callable] = None, proxy_config: Optional[dict] = None):
        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
        })
        self.logged_in = False
        self.log_callback = log_callback
        self.stop_flag = False
        self._closed = False
        self.last_total_records = 0
        self._username = ""

        # Load configuration
        from config import get_config
        config = get_config()
        self.base_url = config.zsgl.base_url
        self.login_url = config.zsgl.login_url
        self.index_url_pattern = config.zsgl.index_url_pattern

        # Configure the proxy, if one was supplied
        if proxy_config and proxy_config.get("server"):
            proxy_server = proxy_config["server"]
            self.session.proxies = {"http": proxy_server, "https": proxy_server}
            self.proxy_server = proxy_server
        else:
            self.proxy_server = None

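    # Usage sketch: proxy_config is a plain dict with a "server" key, e.g.
    #     APIBrowser(proxy_config={"server": "http://127.0.0.1:8080"})
    # (the address is a hypothetical example)
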
    def log(self, message: str):
        """Emit a message through the log callback, if one was provided."""
        if self.log_callback:
            self.log_callback(message)

    def _request_with_retry(self, method: str, url: str, max_retries: int = 3,
                            retry_delay: float = 1, **kwargs) -> requests.Response:
        """Issue a GET/POST request, retrying on failure."""
        kwargs.setdefault("timeout", 10.0)
        last_error = None

        for attempt in range(1, max_retries + 1):
            try:
                if method.lower() == "get":
                    resp = self.session.get(url, **kwargs)
                else:
                    resp = self.session.post(url, **kwargs)
                return resp
            except Exception as e:
                last_error = e
                if attempt < max_retries:
                    self.log(f" Request failed, retrying in {retry_delay}s ({attempt}/{max_retries})...")
                    time.sleep(retry_delay)
                else:
                    self.log(f" Request failed after {max_retries} attempts: {e}")

        raise last_error

    def _get_aspnet_fields(self, soup: BeautifulSoup) -> Dict[str, str]:
        """Collect the hidden ASP.NET form fields required for a postback."""
        fields = {}
        for name in ["__VIEWSTATE", "__VIEWSTATEGENERATOR", "__EVENTVALIDATION"]:
            field = soup.find("input", {"name": name})
            if field:
                fields[name] = field.get("value", "")
        return fields

    def login(self, username: str, password: str) -> bool:
        """Log in with the given credentials."""
        self.log(f" Logging in: {username}")
        self._username = username

        try:
            resp = self._request_with_retry("get", self.login_url)
            soup = BeautifulSoup(resp.text, "html.parser")
            fields = self._get_aspnet_fields(soup)

            data = fields.copy()
            data["txtUserName"] = username
            data["txtPassword"] = password
            data["btnSubmit"] = "登 录"  # literal button caption the server expects; keep as-is

            resp = self._request_with_retry(
                "post",
                self.login_url,
                data=data,
                headers={
                    "Content-Type": "application/x-www-form-urlencoded",
                    "Origin": self.base_url,
                    "Referer": self.login_url,
                },
                allow_redirects=True,
            )

            if self.index_url_pattern in resp.url:
                self.logged_in = True
                self.log(" Login succeeded")
                return True
            else:
                soup = BeautifulSoup(resp.text, "html.parser")
                error = soup.find(id="lblMsg")
                error_msg = error.get_text().strip() if error else "unknown error"
                self.log(f" Login failed: {error_msg}")
                return False

        except Exception as e:
            self.log(f" Login error: {e}")
            return False

    def get_real_name(self) -> Optional[str]:
        """Fetch the user's real name from the profile page."""
        if not self.logged_in:
            return None

        try:
            url = f"{self.base_url}/admin/center.aspx"
            resp = self._request_with_retry("get", url)
            soup = BeautifulSoup(resp.text, "html.parser")

            nlist = soup.find("div", {"class": "nlist-5"})
            if nlist:
                first_li = nlist.find("li")
                if first_li:
                    text = first_li.get_text()
                    # Matches the "姓名:" (name) label on the page; keep the Chinese literal
                    match = re.search(r"姓名[::]\s*([^\((]+)", text)
                    if match:
                        return match.group(1).strip()
            return None
        except Exception:
            return None

    def save_cookies_for_screenshot(self, username: str) -> bool:
        """Export session cookies for wkhtmltoimage (Netscape cookie format)."""
        cookies_path = get_cookie_jar_path(username)
        try:
            parsed = urlsplit(self.base_url)
            cookie_domain = parsed.hostname or "postoa.aidunsoft.com"

            lines = [
                "# Netscape HTTP Cookie File",
                "# Generated by zsglpt-lite",
            ]
            for cookie in self.session.cookies:
                domain = cookie.domain or cookie_domain
                include_subdomains = "TRUE" if domain.startswith(".") else "FALSE"
                path = cookie.path or "/"
                secure = "TRUE" if getattr(cookie, "secure", False) else "FALSE"
                expires = int(getattr(cookie, "expires", 0) or 0)
                lines.append("\t".join([
                    domain,
                    include_subdomains,
                    path,
                    secure,
                    str(expires),
                    cookie.name,
                    cookie.value or "",
                ]))

            with open(cookies_path, "w", encoding="utf-8") as f:
                f.write("\n".join(lines) + "\n")

            self.log(" Cookies saved for screenshot use")
            return True
        except Exception as e:
            self.log(f" Failed to save cookies: {e}")
            return False

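    # Usage sketch (illustrative): pass the exported Netscape jar to
    # wkhtmltoimage so the screenshot renders with the logged-in session.
    # Assumes the installed wkhtmltoimage build supports --cookie-jar; the
    # output path is a hypothetical example.
    #
    #     import subprocess
    #     jar = get_cookie_jar_path("alice")
    #     subprocess.run(
    #         ["wkhtmltoimage", "--cookie-jar", jar,
    #          f"{browser.base_url}/admin/center.aspx", "out.png"],
    #         check=True,
    #     )
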
    def get_article_list_page(self, bz: int = 0, page: int = 1) -> tuple:
        """Fetch one page of the article list."""
        if not self.logged_in:
            return [], 0, None

        if page > 1:
            url = f"{self.base_url}/admin/center.aspx?bz={bz}&page={page}"
        else:
            url = f"{self.base_url}/admin/center.aspx?bz={bz}"

        resp = self._request_with_retry("get", url)
        soup = BeautifulSoup(resp.text, "html.parser")
        articles = []

        ltable = soup.find("table", {"class": "ltable"})
        if ltable:
            rows = ltable.find_all("tr")[1:]  # skip the header row
            for row in rows:
                if "暂无记录" in row.get_text():  # "no records" placeholder row
                    continue

                link = row.find("a", href=True)
                if link:
                    href = link.get("href", "")
                    title = link.get_text().strip()
                    match = re.search(r"id=(\d+)", href)
                    article_id = match.group(1) if match else None
                    articles.append({
                        "title": title,
                        "href": href,
                        "article_id": article_id,
                    })

        # Parse the total record count from the pager ("共N记录" = "N records in total")
        total_pages = 1
        total_records = 0

        page_content = soup.find(id="PageContent")
        if page_content:
            text = page_content.get_text()
            total_match = re.search(r"共(\d+)记录", text)
            if total_match:
                total_records = int(total_match.group(1))
                total_pages = (total_records + 9) // 10  # 10 records per page, rounded up

        self.last_total_records = total_records
        return articles, total_pages, None

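    # Usage sketch: walk every page of the list up front. (browse_content()
    # instead pages incrementally, because marking items read shifts the list.)
    #
    #     articles, total_pages, _ = browser.get_article_list_page(bz=0, page=1)
    #     for page in range(2, total_pages + 1):
    #         more, _, _ = browser.get_article_list_page(bz=0, page=page)
    #         articles.extend(more)
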
    def get_article_attachments(self, article_href: str) -> tuple:
        """Fetch an article's attachment list along with its channel/article ids."""
        if not article_href.startswith("http"):
            url = f"{self.base_url}/admin/{article_href}"
        else:
            url = article_href

        resp = self._request_with_retry("get", url)
        soup = BeautifulSoup(resp.text, "html.parser")

        attachments = []
        article_info = {"channel_id": None, "article_id": None}

        # Pull channel_id and article_id from the saveread button's onclick handler
        for elem in soup.find_all(["button", "input"]):
            onclick = elem.get("onclick", "")
            match = re.search(r"saveread\((\d+),(\d+)\)", onclick)
            if match:
                article_info["channel_id"] = match.group(1)
                article_info["article_id"] = match.group(2)
                break

        attach_list = soup.find("div", {"class": "attach-list2"})
        if attach_list:
            items = attach_list.find_all("li")
            for item in items:
                download_links = item.find_all("a", onclick=re.compile(r"download2?\.ashx"))
                for link in download_links:
                    onclick = link.get("onclick", "")
                    id_match = re.search(r"id=(\d+)", onclick)
                    channel_match = re.search(r"channel_id=(\d+)", onclick)
                    if id_match:
                        attach_id = id_match.group(1)
                        channel_id = channel_match.group(1) if channel_match else "1"
                        h3 = item.find("h3")
                        # Fallback display name ("附件" = attachment)
                        filename = h3.get_text().strip() if h3 else f"附件{attach_id}"
                        attachments.append({
                            "id": attach_id,
                            "channel_id": channel_id,
                            "filename": filename
                        })
                        break

        return attachments, article_info

    def mark_article_read(self, channel_id: str, article_id: str) -> bool:
        """Mark an article as read via the saveread API."""
        if not channel_id or not article_id:
            return False

        saveread_url = (
            f"{self.base_url}/tools/submit_ajax.ashx?action=saveread"
            f"&time={random.random()}&fl={channel_id}&id={article_id}"
        )

        try:
            resp = self._request_with_retry("post", saveread_url)
            if resp.status_code == 200:
                try:
                    data = resp.json()
                    return data.get("status") == 1
                except ValueError:
                    # Non-JSON body with a 200 status still counts as success
                    return True
            return False
        except Exception:
            return False

    def mark_attachment_read(self, attach_id: str, channel_id: str = "1") -> bool:
        """Mark an attachment as read by hitting its preview/download endpoint."""
        download_url = f"{self.base_url}/tools/download2.ashx?site=main&id={attach_id}&channel_id={channel_id}"

        try:
            resp = self._request_with_retry("get", download_url, stream=True)
            resp.close()
            return resp.status_code == 200
        except Exception:
            return False

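    # Usage sketch: mark a single article and its attachments read, mirroring
    # what browse_content() does for each item.
    #
    #     attachments, info = browser.get_article_attachments(article["href"])
    #     if info["channel_id"] and info["article_id"]:
    #         browser.mark_article_read(info["channel_id"], info["article_id"])
    #     for attach in attachments:
    #         browser.mark_attachment_read(attach["id"], attach["channel_id"])
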
    def browse_content(
        self,
        browse_type: str,
        should_stop_callback: Optional[Callable] = None,
        progress_callback: Optional[Callable] = None,
    ) -> APIBrowseResult:
        """
        Browse content and mark it read.

        Args:
            browse_type: browse category label ("应读" = required reading /
                "注册前未读" = unread from before registration)
            should_stop_callback: callback polled to check whether to stop
            progress_callback: progress callback, reports the live browse count;
                called with {"total_items": int, "browsed_items": int}

        Returns:
            The browse result.
        """
        result = APIBrowseResult(success=False)

        if not self.logged_in:
            result.error_message = "not logged in"
            return result

        # Map the browse type to the bz parameter (since the site update, bz=0 is the required-reading list)
        bz = 0

        self.log(f" Browsing '{browse_type}' (bz={bz})...")

        try:
            total_items = 0
            total_attachments = 0

            # Fetch the first page
            articles, total_pages, _ = self.get_article_list_page(bz, 1)

            if not articles:
                self.log(f" '{browse_type}' has no pending items")
                result.success = True
                return result

            total_records = self.last_total_records
            self.log(f" {total_records} records in total, processing...")

            # Report initial progress
            if progress_callback:
                progress_callback({"total_items": total_records, "browsed_items": 0})

            processed_hrefs = set()
            current_page = 1
            max_iterations = total_records + 20  # safety bound against endless re-paging

            for _ in range(max_iterations):
                if should_stop_callback and should_stop_callback():
                    self.log(" Stop signal received")
                    break

                if not articles:
                    break

                new_articles_in_page = 0

                for article in articles:
                    if should_stop_callback and should_stop_callback():
                        break

                    article_href = article["href"]
                    if article_href in processed_hrefs:
                        continue

                    processed_hrefs.add(article_href)
                    new_articles_in_page += 1
                    title = article["title"][:30]

                    # Fetch attachments and article info
                    try:
                        attachments, article_info = self.get_article_attachments(article_href)
                    except Exception as e:
                        self.log(f" Failed to fetch article: {title} | {e}")
                        continue

                    total_items += 1

                    # Mark the article itself read
                    article_marked = False
                    if article_info.get("channel_id") and article_info.get("article_id"):
                        article_marked = self.mark_article_read(
                            article_info["channel_id"],
                            article_info["article_id"]
                        )

                    # Process the attachments
                    if attachments:
                        for attach in attachments:
                            if self.mark_attachment_read(attach["id"], attach["channel_id"]):
                                total_attachments += 1
                        self.log(f" [{total_items}] {title} - {len(attachments)} attachment(s)")
                    else:
                        status = "marked" if article_marked else "mark failed"
                        self.log(f" [{total_items}] {title} - no attachments ({status})")

                    # Report progress
                    if progress_callback:
                        progress_callback({"total_items": total_records, "browsed_items": total_items})

                    # Small delay to avoid hammering the server
                    time.sleep(0.05)

                # Decide the next step: marking items read shifts the unread list,
                # so re-read page 1 while this page still yielded new items;
                # otherwise advance to the next page.
                if new_articles_in_page > 0:
                    current_page = 1
                else:
                    current_page += 1
                    if current_page > total_pages:
                        break

                # Fetch the next page
                try:
                    articles, new_total_pages, _ = self.get_article_list_page(bz, current_page)
                    if new_total_pages > 0:
                        total_pages = new_total_pages
                except Exception as e:
                    self.log(f" Failed to fetch page {current_page} of the list: {e}")
                    break

            self.log(f" Browse finished: {total_items} items, {total_attachments} attachments")
            result.success = True
            result.total_items = total_items
            result.total_attachments = total_attachments
            return result

        except Exception as e:
            result.error_message = str(e)
            self.log(f" Browse error: {e}")
            return result

    def close(self):
        """Close the HTTP session."""
        if self._closed:
            return
        self._closed = True
        try:
            self.session.close()
        except Exception:
            pass

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()
        return False
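
# Usage sketch (illustrative): run a full browse pass with a stop flag and a
# progress callback. The credentials are hypothetical placeholders; "应读" is
# the site's required-reading list label.
#
#     import threading
#
#     stop_event = threading.Event()
#
#     def on_progress(info: dict) -> None:
#         print(f"{info['browsed_items']}/{info['total_items']}")
#
#     with APIBrowser(log_callback=print) as browser:
#         if browser.login("alice", "secret"):
#             result = browser.browse_content(
#                 "应读",
#                 should_stop_callback=stop_event.is_set,
#                 progress_callback=on_progress,
#             )
#             print(result.success, result.total_items, result.total_attachments)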