feat: knowledge management platform (lite) - PyQt6 desktop app

Features:
- Account management: add/edit/delete accounts, test login
- Browse tasks: batch-browse must-read/optional-read content and mark it read
- Screenshot management: capture pages with wkhtmltoimage, view history
- Kingsoft Docs (金山文档): QR-code / WeChat quick login, auto-upload screenshots

Tech stack:
- PyQt6 GUI framework
- Playwright browser automation
- SQLite local data storage
- wkhtmltoimage web-page screenshots

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
commit 83fef6dff2
Date: 2026-01-18 22:16:36 +08:00
24 changed files with 6133 additions and 0 deletions

core/api_browser.py (new file)
@@ -0,0 +1,504 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
API browser - lite edition
Browsing implemented with plain HTTP requests, roughly 30-60x faster than
driving a real browser. Trimmed from the original project: caching,
diagnostic logging, and other nonessential features were removed.
"""
import os
import random
import re
import time
import hashlib
from typing import Optional, Callable, Dict
from dataclasses import dataclass
from urllib.parse import urlsplit
import requests
from bs4 import BeautifulSoup
@dataclass
class APIBrowseResult:
"""API浏览结果"""
success: bool
total_items: int = 0
total_attachments: int = 0
error_message: str = ""
def get_cookie_jar_path(username: str) -> str:
"""获取截图用的cookies文件路径Netscape Cookie格式"""
from config import COOKIES_DIR
COOKIES_DIR.mkdir(exist_ok=True)
filename = hashlib.sha256(username.encode()).hexdigest()[:32] + ".cookies.txt"
return str(COOKIES_DIR / filename)
def is_cookie_jar_fresh(cookie_path: str, max_age_seconds: int = 86400) -> bool:
"""判断cookies文件是否存在且未过期默认24小时"""
if not cookie_path or not os.path.exists(cookie_path):
return False
try:
file_age = time.time() - os.path.getmtime(cookie_path)
return file_age <= max(0, int(max_age_seconds or 0))
except Exception:
return False
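# A minimal caller-side sketch (not part of this file): reuse a recent cookie
# jar for screenshots instead of logging in again:
#   jar = get_cookie_jar_path(username)
#   if not is_cookie_jar_fresh(jar):
#       with APIBrowser() as browser:
#           if browser.login(username, password):
#               browser.save_cookies_for_screenshot(username)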
class APIBrowser:
"""
API浏览器 - 使用纯HTTP请求实现浏览
用法:
with APIBrowser(log_callback=print) as browser:
if browser.login(username, password):
result = browser.browse_content("应读")
"""
def __init__(self, log_callback: Optional[Callable] = None, proxy_config: Optional[dict] = None):
self.session = requests.Session()
self.session.headers.update({
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
"Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
})
self.logged_in = False
self.log_callback = log_callback
self.stop_flag = False
self._closed = False
self.last_total_records = 0
self._username = ""
        # Load configuration
from config import get_config
config = get_config()
self.base_url = config.zsgl.base_url
self.login_url = config.zsgl.login_url
self.index_url_pattern = config.zsgl.index_url_pattern
        # Configure an optional proxy (same server for HTTP and HTTPS)
if proxy_config and proxy_config.get("server"):
proxy_server = proxy_config["server"]
self.session.proxies = {"http": proxy_server, "https": proxy_server}
self.proxy_server = proxy_server
else:
self.proxy_server = None
def log(self, message: str):
"""记录日志"""
if self.log_callback:
self.log_callback(message)
def _request_with_retry(self, method: str, url: str, max_retries: int = 3,
retry_delay: float = 1, **kwargs) -> requests.Response:
"""带重试机制的请求方法"""
kwargs.setdefault("timeout", 10.0)
last_error = None
for attempt in range(1, max_retries + 1):
try:
if method.lower() == "get":
resp = self.session.get(url, **kwargs)
else:
resp = self.session.post(url, **kwargs)
return resp
except Exception as e:
last_error = e
if attempt < max_retries:
self.log(f"[API] 请求超时,{retry_delay}秒后重试 ({attempt}/{max_retries})...")
time.sleep(retry_delay)
else:
self.log(f"[API] 请求失败,已重试{max_retries}次: {str(e)}")
raise last_error
def _get_aspnet_fields(self, soup: BeautifulSoup) -> Dict[str, str]:
"""获取ASP.NET隐藏字段"""
fields = {}
for name in ["__VIEWSTATE", "__VIEWSTATEGENERATOR", "__EVENTVALIDATION"]:
field = soup.find("input", {"name": name})
if field:
fields[name] = field.get("value", "")
return fields
def login(self, username: str, password: str) -> bool:
"""登录"""
self.log(f"[API] 登录: {username}")
self._username = username
try:
resp = self._request_with_retry("get", self.login_url)
soup = BeautifulSoup(resp.text, "html.parser")
fields = self._get_aspnet_fields(soup)
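            # ASP.NET WebForms rejects a POST unless the page's hidden
            # __VIEWSTATE / __EVENTVALIDATION fields are echoed back, so the
            # login form is fetched first and those values are carried over.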
data = fields.copy()
data["txtUserName"] = username
data["txtPassword"] = password
data["btnSubmit"] = "登 录"
resp = self._request_with_retry(
"post",
self.login_url,
data=data,
headers={
"Content-Type": "application/x-www-form-urlencoded",
"Origin": self.base_url,
"Referer": self.login_url,
},
allow_redirects=True,
)
if self.index_url_pattern in resp.url:
self.logged_in = True
self.log(f"[API] 登录成功")
return True
else:
soup = BeautifulSoup(resp.text, "html.parser")
error = soup.find(id="lblMsg")
                error_msg = error.get_text().strip() if error else "unknown error"
                self.log(f"[API] login failed: {error_msg}")
return False
except Exception as e:
self.log(f"[API] 登录异常: {str(e)}")
return False
def get_real_name(self) -> Optional[str]:
"""获取用户真实姓名"""
if not self.logged_in:
return None
try:
url = f"{self.base_url}/admin/center.aspx"
resp = self._request_with_retry("get", url)
soup = BeautifulSoup(resp.text, "html.parser")
nlist = soup.find("div", {"class": "nlist-5"})
if nlist:
first_li = nlist.find("li")
if first_li:
                    text = first_li.get_text()
                    # the label "姓名:" ("Name:") precedes the real name
                    match = re.search(r"姓名[:]\s*([^\(]+)", text)
if match:
return match.group(1).strip()
return None
except Exception:
return None
def save_cookies_for_screenshot(self, username: str) -> bool:
"""保存cookies供wkhtmltoimage使用Netscape Cookie格式"""
cookies_path = get_cookie_jar_path(username)
try:
parsed = urlsplit(self.base_url)
cookie_domain = parsed.hostname or "postoa.aidunsoft.com"
lines = [
"# Netscape HTTP Cookie File",
"# Generated by zsglpt-lite",
]
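            # Each Netscape cookie-jar line has seven tab-separated fields:
            # domain, include-subdomains flag, path, secure flag,
            # expiry (Unix time, 0 = session cookie), name, value.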
for cookie in self.session.cookies:
domain = cookie.domain or cookie_domain
include_subdomains = "TRUE" if domain.startswith(".") else "FALSE"
path = cookie.path or "/"
secure = "TRUE" if getattr(cookie, "secure", False) else "FALSE"
expires = int(getattr(cookie, "expires", 0) or 0)
lines.append("\t".join([
domain,
include_subdomains,
path,
secure,
str(expires),
cookie.name,
cookie.value,
]))
with open(cookies_path, "w", encoding="utf-8") as f:
f.write("\n".join(lines) + "\n")
self.log(f"[API] Cookies已保存供截图使用")
return True
except Exception as e:
self.log(f"[API] 保存cookies失败: {e}")
return False
def get_article_list_page(self, bz: int = 0, page: int = 1) -> tuple:
"""获取单页文章列表"""
if not self.logged_in:
return [], 0, None
if page > 1:
url = f"{self.base_url}/admin/center.aspx?bz={bz}&page={page}"
else:
url = f"{self.base_url}/admin/center.aspx?bz={bz}"
resp = self._request_with_retry("get", url)
soup = BeautifulSoup(resp.text, "html.parser")
articles = []
ltable = soup.find("table", {"class": "ltable"})
if ltable:
rows = ltable.find_all("tr")[1:]
for row in rows:
if "暂无记录" in row.get_text():
continue
link = row.find("a", href=True)
if link:
href = link.get("href", "")
title = link.get_text().strip()
match = re.search(r"id=(\d+)", href)
article_id = match.group(1) if match else None
articles.append({
"title": title,
"href": href,
"article_id": article_id,
})
        # Parse the record total from the pager text ("共N记录" = "N records in total")
        total_pages = 1
        total_records = 0
        page_content = soup.find(id="PageContent")
        if page_content:
            text = page_content.get_text()
            total_match = re.search(r"共(\d+)记录", text)
            if total_match:
                total_records = int(total_match.group(1))
                # the site lists 10 records per page
                total_pages = (total_records + 9) // 10
self.last_total_records = total_records
return articles, total_pages, None
def get_article_attachments(self, article_href: str) -> tuple:
"""获取文章的附件列表和文章信息"""
if not article_href.startswith("http"):
url = f"{self.base_url}/admin/{article_href}"
else:
url = article_href
resp = self._request_with_retry("get", url)
soup = BeautifulSoup(resp.text, "html.parser")
attachments = []
article_info = {"channel_id": None, "article_id": None}
        # Extract channel_id and article_id from the saveread(...) onclick handler
for elem in soup.find_all(["button", "input"]):
onclick = elem.get("onclick", "")
match = re.search(r"saveread\((\d+),(\d+)\)", onclick)
if match:
article_info["channel_id"] = match.group(1)
article_info["article_id"] = match.group(2)
break
attach_list = soup.find("div", {"class": "attach-list2"})
if attach_list:
items = attach_list.find_all("li")
for item in items:
download_links = item.find_all("a", onclick=re.compile(r"download2?\.ashx"))
for link in download_links:
onclick = link.get("onclick", "")
id_match = re.search(r"id=(\d+)", onclick)
channel_match = re.search(r"channel_id=(\d+)", onclick)
if id_match:
attach_id = id_match.group(1)
channel_id = channel_match.group(1) if channel_match else "1"
h3 = item.find("h3")
filename = h3.get_text().strip() if h3 else f"附件{attach_id}"
attachments.append({
"id": attach_id,
"channel_id": channel_id,
"filename": filename
})
                        break  # only the first download link per list item is used
return attachments, article_info
def mark_article_read(self, channel_id: str, article_id: str) -> bool:
"""通过saveread API标记文章已读"""
if not channel_id or not article_id:
return False
        # time=random.random() is a cache-buster; fl = channel, id = article
        saveread_url = (
            f"{self.base_url}/tools/submit_ajax.ashx?action=saveread"
            f"&time={random.random()}&fl={channel_id}&id={article_id}"
        )
try:
resp = self._request_with_retry("post", saveread_url)
if resp.status_code == 200:
                try:
                    data = resp.json()
                    return data.get("status") == 1
                except ValueError:
                    # non-JSON response body: treat HTTP 200 as success
                    return True
return False
        except Exception:
            return False
def mark_attachment_read(self, attach_id: str, channel_id: str = "1") -> bool:
"""通过访问预览通道标记附件已读"""
download_url = f"{self.base_url}/tools/download2.ashx?site=main&id={attach_id}&channel_id={channel_id}"
        try:
            # stream=True plus an immediate close hits the endpoint without
            # downloading the attachment body
            resp = self._request_with_retry("get", download_url, stream=True)
            resp.close()
            return resp.status_code == 200
        except Exception:
            return False
def browse_content(
self,
browse_type: str,
should_stop_callback: Optional[Callable] = None,
progress_callback: Optional[Callable] = None,
) -> APIBrowseResult:
"""
浏览内容并标记已读
Args:
browse_type: 浏览类型 (应读/注册前未读)
should_stop_callback: 检查是否应该停止的回调函数
progress_callback: 进度回调,用于实时上报已浏览内容数量
回调参数: {"total_items": int, "browsed_items": int}
Returns:
浏览结果
"""
result = APIBrowseResult(success=False)
if not self.logged_in:
result.error_message = "未登录"
return result
        # bz query parameter selects the list view (after the site update, bz=0 is the must-read list)
        bz = 0
        self.log(f"[API] browsing '{browse_type}' (bz={bz})...")
try:
total_items = 0
total_attachments = 0
            # Fetch the first page
articles, total_pages, _ = self.get_article_list_page(bz, 1)
if not articles:
self.log(f"[API] '{browse_type}' 没有待处理内容")
result.success = True
return result
total_records = self.last_total_records
self.log(f"[API] 共 {total_records} 条记录,开始处理...")
# 上报初始进度
if progress_callback:
progress_callback({"total_items": total_records, "browsed_items": 0})
processed_hrefs = set()
current_page = 1
            max_iterations = total_records + 20  # safety bound against endless pagination
for iteration in range(max_iterations):
if should_stop_callback and should_stop_callback():
self.log("[API] 收到停止信号")
break
if not articles:
break
new_articles_in_page = 0
for article in articles:
if should_stop_callback and should_stop_callback():
break
article_href = article["href"]
if article_href in processed_hrefs:
continue
processed_hrefs.add(article_href)
new_articles_in_page += 1
title = article["title"][:30]
                    # Fetch attachments and article metadata
try:
attachments, article_info = self.get_article_attachments(article_href)
except Exception as e:
self.log(f"[API] 获取文章失败: {title} | {str(e)}")
continue
total_items += 1
                    # Mark the article itself as read
article_marked = False
if article_info.get("channel_id") and article_info.get("article_id"):
article_marked = self.mark_article_read(
article_info["channel_id"],
article_info["article_id"]
)
                    # Process attachments
if attachments:
for attach in attachments:
if self.mark_attachment_read(attach["id"], attach["channel_id"]):
total_attachments += 1
self.log(f"[API] [{total_items}] {title} - {len(attachments)}个附件")
else:
status = "已标记" if article_marked else "标记失败"
self.log(f"[API] [{total_items}] {title} - 无附件({status})")
                    # Report progress
if progress_callback:
progress_callback({"total_items": total_records, "browsed_items": total_items})
                    # Small delay to avoid hammering the server
time.sleep(0.05)
                # Marking items read removes them from the pending list and
                # shifts pagination, so rescan page 1 whenever this pass made
                # progress; only advance when a page yielded nothing new.
                if new_articles_in_page > 0:
                    current_page = 1
                else:
                    current_page += 1
                    if current_page > total_pages:
                        break
                # Fetch the next page
try:
articles, new_total_pages, _ = self.get_article_list_page(bz, current_page)
if new_total_pages > 0:
total_pages = new_total_pages
except Exception as e:
self.log(f"[API] 获取第{current_page}页列表失败: {str(e)}")
break
self.log(f"[API] 浏览完成: {total_items} 条内容,{total_attachments} 个附件")
result.success = True
result.total_items = total_items
result.total_attachments = total_attachments
return result
except Exception as e:
result.error_message = str(e)
self.log(f"[API] 浏览出错: {str(e)}")
return result
def close(self):
"""关闭会话"""
if self._closed:
return
self._closed = True
try:
self.session.close()
        except Exception:
pass
def __enter__(self):
return self
def __exit__(self, exc_type, exc_val, exc_tb):
self.close()
return False