🎉 Project optimization and bug-fix release

Key optimizations:
- Fixed Unicode character encoding issues (Windows cross-platform compatibility)
- Installed wkhtmltoimage; the screenshot feature is fully restored
- Smart delay optimization (api_browser.py)
- Thread pool resource leak fix (tasks.py); see the sketch after this list
- HTML parsing cache mechanism
- Binary search optimization (kdocs_uploader.py)
- Adaptive resource configuration (browser_pool_worker.py)
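
The tasks.py change itself is not part of the diff shown below. As an illustration of the thread-pool leak fix named above, here is a minimal sketch of the usual pattern (one shared executor, shut down at process exit); the function and variable names are hypothetical, not taken from the repository.

# Hypothetical sketch of the shared thread-pool pattern; names are illustrative,
# not the actual tasks.py code.
import atexit
from concurrent.futures import ThreadPoolExecutor
from typing import Optional

_executor: Optional[ThreadPoolExecutor] = None

def get_executor() -> ThreadPoolExecutor:
    """Return one shared executor instead of creating a new pool per task."""
    global _executor
    if _executor is None:
        _executor = ThreadPoolExecutor(max_workers=4, thread_name_prefix="tasks")
        # Shut the pool down at process exit so worker threads are not leaked.
        atexit.register(_executor.shutdown, wait=False)
    return _executor

Reusing a single pool and registering its shutdown is the standard way to avoid the kind of leak described above.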

🐛 Bug fixes:
- Resolved screenshot failures
- Fixed admin password setup
- Resolved encoding errors on application startup

📚 New documentation:
- BUG_REPORT.md - full bug analysis report
- PERFORMANCE_ANALYSIS_REPORT.md - performance optimization analysis
- LINUX_DEPLOYMENT_ANALYSIS.md - Linux deployment guide
- SCREENSHOT_FIX_SUCCESS.md - record of the screenshot fix
- INSTALL_WKHTMLTOIMAGE.md - installation guide
- OPTIMIZATION_FIXES_SUMMARY.md - summary of the optimizations

🚀 Verified functionality:
- Flask app runs normally (port 51233)
- Database, screenshot thread pool, and API warmup all working
- Admin login: admin/admin123
- Health check API: http://127.0.0.1:51233/health (see the smoke-test sketch below)
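
A minimal smoke test for the health-check endpoint listed above; only the URL and port come from this commit message, and the response format is an assumption.

# Smoke-test sketch for the /health endpoint; the response shape is an assumption.
import requests

resp = requests.get("http://127.0.0.1:51233/health", timeout=5)
print(resp.status_code)  # expect 200 when the app is healthy
print(resp.text)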

💡 Technical improvements:
- Smart delay algorithm (adaptive adjustment)
- LRU cache strategy
- Thread pool resource management
- Binary search (O(log n) vs O(n)); illustrated after this list
- Adaptive resource management
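
The binary-search change in kdocs_uploader.py is not included in the diff below. As a generic illustration of the O(log n) vs O(n) claim, here is a sketch using the standard bisect module on hypothetical data (a sorted list of record IDs):

# Generic O(log n) membership check with bisect; the data and use case are
# hypothetical, not taken from kdocs_uploader.py.
import bisect

sorted_ids = [3, 8, 15, 27, 42, 99]  # must already be sorted

def contains(sorted_list, value) -> bool:
    """Binary search, O(log n), instead of a linear `value in list` scan, O(n)."""
    i = bisect.bisect_left(sorted_list, value)
    return i < len(sorted_list) and sorted_list[i] == value

print(contains(sorted_ids, 27))  # True
print(contains(sorted_ids, 28))  # False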

🎯 The project now runs stably and is ready to deploy to a Linux environment
zsglpt Optimizer
2026-01-16 17:39:55 +08:00
parent 722dccdc78
commit 7e9a772104
47 changed files with 9382 additions and 749 deletions


@@ -15,14 +15,78 @@ import weakref
from typing import Optional, Callable
from dataclasses import dataclass
from urllib.parse import urlsplit
import threading
from app_config import get_config
import time as _time_module
_MODULE_START_TIME = _time_module.time()
_WARMUP_PERIOD_SECONDS = 60 # 启动后 60 秒内使用更长超时
_WARMUP_TIMEOUT_SECONDS = 15.0 # 预热期间的超时时间
# HTML解析缓存类
class HTMLParseCache:
"""HTML解析结果缓存"""
def __init__(self, ttl: int = 300, maxsize: int = 1000):
self.cache = {}
self.ttl = ttl
self.maxsize = maxsize
self._access_times = {}
self._lock = threading.RLock()
def _make_key(self, url: str, content_hash: str) -> str:
return f"{url}:{content_hash}"
def get(self, key: str) -> Optional[tuple]:
"""获取缓存,如果存在且未过期"""
with self._lock:
if key in self.cache:
value, timestamp = self.cache[key]
if time.time() - timestamp < self.ttl:
self._access_times[key] = time.time()
return value
else:
# 过期删除
del self.cache[key]
del self._access_times[key]
return None
def set(self, key: str, value: tuple):
"""设置缓存"""
with self._lock:
# 如果缓存已满,删除最久未访问的项
if len(self.cache) >= self.maxsize:
if self._access_times:
# 使用简单的LRU策略删除最久未访问的项
oldest_key = None
oldest_time = float("inf")
for cached_key, access_time in self._access_times.items():  # 不要遮蔽参数 key
if access_time < oldest_time:
oldest_time = access_time
oldest_key = cached_key
if oldest_key:
del self.cache[oldest_key]
del self._access_times[oldest_key]
self.cache[key] = (value, time.time())
self._access_times[key] = time.time()
def clear(self):
"""清空缓存"""
with self._lock:
self.cache.clear()
self._access_times.clear()
def get_lru_key(self) -> Optional[str]:
"""获取最久未访问的键"""
if not self._access_times:
return None
return min(self._access_times.keys(), key=lambda k: self._access_times[k])
config = get_config()
BASE_URL = getattr(config, "ZSGL_BASE_URL", "https://postoa.aidunsoft.com")
@@ -31,7 +95,9 @@ INDEX_URL_PATTERN = getattr(config, "ZSGL_INDEX_URL_PATTERN", "index.aspx")
COOKIES_DIR = getattr(config, "COOKIES_DIR", "data/cookies")
try:
_API_REQUEST_TIMEOUT_SECONDS = float(os.environ.get("API_REQUEST_TIMEOUT_SECONDS") or os.environ.get("API_REQUEST_TIMEOUT") or "5")
_API_REQUEST_TIMEOUT_SECONDS = float(
os.environ.get("API_REQUEST_TIMEOUT_SECONDS") or os.environ.get("API_REQUEST_TIMEOUT") or "5"
)
except Exception:
_API_REQUEST_TIMEOUT_SECONDS = 5.0
_API_REQUEST_TIMEOUT_SECONDS = max(3.0, _API_REQUEST_TIMEOUT_SECONDS)
@@ -66,6 +132,7 @@ def is_cookie_jar_fresh(cookie_path: str, max_age_seconds: int = _COOKIE_JAR_MAX
except Exception:
return False
_api_browser_instances: "weakref.WeakSet[APIBrowser]" = weakref.WeakSet()
@@ -84,6 +151,7 @@ atexit.register(_cleanup_api_browser_instances)
@dataclass
class APIBrowseResult:
"""API 浏览结果"""
success: bool
total_items: int = 0
total_attachments: int = 0
@@ -95,34 +163,73 @@ class APIBrowser:
def __init__(self, log_callback: Optional[Callable] = None, proxy_config: Optional[dict] = None):
self.session = requests.Session()
self.session.headers.update({
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
})
self.session.headers.update(
{
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
"Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
}
)
self.logged_in = False
self.log_callback = log_callback
self.stop_flag = False
self._closed = False # 防止重复关闭
self.last_total_records = 0
# 初始化HTML解析缓存
self._parse_cache = HTMLParseCache(ttl=300, maxsize=500) # 5分钟缓存,最多500条记录
# 设置代理
if proxy_config and proxy_config.get("server"):
proxy_server = proxy_config["server"]
self.session.proxies = {
"http": proxy_server,
"https": proxy_server
}
self.session.proxies = {"http": proxy_server, "https": proxy_server}
self.proxy_server = proxy_server
else:
self.proxy_server = None
_api_browser_instances.add(self)
def _calculate_adaptive_delay(self, iteration: int, consecutive_failures: int) -> float:
"""
智能延迟计算:文章处理延迟
根据迭代次数和连续失败次数动态调整延迟
"""
# 基础延迟,显著降低
base_delay = 0.03
# 如果有连续失败,增加延迟但有上限
if consecutive_failures > 0:
delay = base_delay * (1.5 ** min(consecutive_failures, 3))
return min(delay, 0.2) # 最多200ms
# 根据处理进度调整延迟,开始时较慢,后来可以更快
progress_factor = min(iteration / 100.0, 1.0) # 100个文章后达到最大优化
optimized_delay = base_delay * (1.2 - 0.4 * progress_factor) # 从120%逐渐降低到80%
return max(optimized_delay, 0.02) # 最少20ms
def _calculate_page_delay(self, current_page: int, new_articles_in_page: int) -> float:
"""
智能延迟计算:页面处理延迟
根据页面位置和新文章数量调整延迟
"""
base_delay = 0.08 # 基础延迟降低50%
# 如果当前页有大量新文章,可以稍微增加延迟
if new_articles_in_page > 10:
return base_delay * 1.2
# 如果是新页面,降低延迟(内容可能需要加载)
if current_page <= 3:
return base_delay * 1.1
# 后续页面可以更快
return base_delay * 0.8
def log(self, message: str):
"""记录日志"""
if self.log_callback:
self.log_callback(message)
def save_cookies_for_screenshot(self, username: str):
"""保存 cookies 供 wkhtmltoimage 使用Netscape Cookie 格式)"""
cookies_path = get_cookie_jar_path(username)
@@ -160,24 +267,22 @@ class APIBrowser:
self.log(f"[API] 保存cookies失败: {e}")
return False
def _request_with_retry(self, method, url, max_retries=3, retry_delay=1, **kwargs):
"""带重试机制的请求方法"""
# 启动后 60 秒内使用更长超时(15秒),之后使用配置的超时
if (_time_module.time() - _MODULE_START_TIME) < _WARMUP_PERIOD_SECONDS:
kwargs.setdefault('timeout', _WARMUP_TIMEOUT_SECONDS)
kwargs.setdefault("timeout", _WARMUP_TIMEOUT_SECONDS)
else:
kwargs.setdefault('timeout', _API_REQUEST_TIMEOUT_SECONDS)
kwargs.setdefault("timeout", _API_REQUEST_TIMEOUT_SECONDS)
last_error = None
timeout_value = kwargs.get("timeout")
diag_enabled = _API_DIAGNOSTIC_LOG
slow_ms = _API_DIAGNOSTIC_SLOW_MS
for attempt in range(1, max_retries + 1):
start_ts = _time_module.time()
try:
if method.lower() == 'get':
if method.lower() == "get":
resp = self.session.get(url, **kwargs)
else:
resp = self.session.post(url, **kwargs)
@@ -198,19 +303,20 @@ class APIBrowser:
if attempt < max_retries:
self.log(f"[API] 请求超时,{retry_delay}秒后重试 ({attempt}/{max_retries})...")
import time
time.sleep(retry_delay)
else:
self.log(f"[API] 请求失败,已重试{max_retries}次: {str(e)}")
raise last_error
def _get_aspnet_fields(self, soup):
"""获取 ASP.NET 隐藏字段"""
fields = {}
for name in ['__VIEWSTATE', '__VIEWSTATEGENERATOR', '__EVENTVALIDATION']:
field = soup.find('input', {'name': name})
for name in ["__VIEWSTATE", "__VIEWSTATEGENERATOR", "__EVENTVALIDATION"]:
field = soup.find("input", {"name": name})
if field:
fields[name] = field.get('value', '')
fields[name] = field.get("value", "")
return fields
def get_real_name(self) -> Optional[str]:
@@ -224,18 +330,18 @@ class APIBrowser:
try:
url = f"{BASE_URL}/admin/center.aspx"
resp = self._request_with_retry('get', url)
soup = BeautifulSoup(resp.text, 'html.parser')
resp = self._request_with_retry("get", url)
soup = BeautifulSoup(resp.text, "html.parser")
# 查找包含"姓名:"的元素
# 页面格式: <li><p>姓名:喻勇祥(19174616018) 人力资源编码: ...</p></li>
nlist = soup.find('div', {'class': 'nlist-5'})
nlist = soup.find("div", {"class": "nlist-5"})
if nlist:
first_li = nlist.find('li')
first_li = nlist.find("li")
if first_li:
text = first_li.get_text()
# 解析姓名:格式为 "姓名:XXX(手机号)"
match = re.search(r'姓名[:]\s*([^\(]+)', text)
match = re.search(r"姓名[:]\s*([^\(]+)", text)
if match:
real_name = match.group(1).strip()
if real_name:
@@ -249,26 +355,26 @@ class APIBrowser:
self.log(f"[API] 登录: {username}")
try:
resp = self._request_with_retry('get', LOGIN_URL)
resp = self._request_with_retry("get", LOGIN_URL)
soup = BeautifulSoup(resp.text, 'html.parser')
soup = BeautifulSoup(resp.text, "html.parser")
fields = self._get_aspnet_fields(soup)
data = fields.copy()
data['txtUserName'] = username
data['txtPassword'] = password
data['btnSubmit'] = '登 录'
data["txtUserName"] = username
data["txtPassword"] = password
data["btnSubmit"] = "登 录"
resp = self._request_with_retry(
'post',
"post",
LOGIN_URL,
data=data,
headers={
'Content-Type': 'application/x-www-form-urlencoded',
'Origin': BASE_URL,
'Referer': LOGIN_URL,
"Content-Type": "application/x-www-form-urlencoded",
"Origin": BASE_URL,
"Referer": LOGIN_URL,
},
allow_redirects=True
allow_redirects=True,
)
if INDEX_URL_PATTERN in resp.url:
@@ -276,9 +382,9 @@ class APIBrowser:
self.log(f"[API] 登录成功")
return True
else:
soup = BeautifulSoup(resp.text, 'html.parser')
error = soup.find(id='lblMsg')
error_msg = error.get_text().strip() if error else '未知错误'
soup = BeautifulSoup(resp.text, "html.parser")
error = soup.find(id="lblMsg")
error_msg = error.get_text().strip() if error else "未知错误"
self.log(f"[API] 登录失败: {error_msg}")
return False
@@ -292,55 +398,57 @@ class APIBrowser:
return [], 0, None
if base_url and page > 1:
url = re.sub(r'page=\d+', f'page={page}', base_url)
url = re.sub(r"page=\d+", f"page={page}", base_url)
elif page > 1:
# 兼容兜底:若没有 next_url(极少数情况下页面不提供“下一页”链接),尝试直接拼 page 参数
url = f"{BASE_URL}/admin/center.aspx?bz={bz}&page={page}"
else:
url = f"{BASE_URL}/admin/center.aspx?bz={bz}"
resp = self._request_with_retry('get', url)
soup = BeautifulSoup(resp.text, 'html.parser')
resp = self._request_with_retry("get", url)
soup = BeautifulSoup(resp.text, "html.parser")
articles = []
ltable = soup.find('table', {'class': 'ltable'})
ltable = soup.find("table", {"class": "ltable"})
if ltable:
rows = ltable.find_all('tr')[1:]
rows = ltable.find_all("tr")[1:]
for row in rows:
# 检查是否是"暂无记录"
if '暂无记录' in row.get_text():
if "暂无记录" in row.get_text():
continue
link = row.find('a', href=True)
link = row.find("a", href=True)
if link:
href = link.get('href', '')
href = link.get("href", "")
title = link.get_text().strip()
match = re.search(r'id=(\d+)', href)
match = re.search(r"id=(\d+)", href)
article_id = match.group(1) if match else None
articles.append({
'title': title,
'href': href,
'article_id': article_id,
})
articles.append(
{
"title": title,
"href": href,
"article_id": article_id,
}
)
# 获取总页数
total_pages = 1
next_page_url = None
total_records = 0
page_content = soup.find(id='PageContent')
page_content = soup.find(id="PageContent")
if page_content:
text = page_content.get_text()
total_match = re.search(r'共(\d+)记录', text)
total_match = re.search(r"共(\d+)记录", text)
if total_match:
total_records = int(total_match.group(1))
total_pages = (total_records + 9) // 10
next_link = page_content.find('a', string=re.compile('下一页'))
next_link = page_content.find("a", string=re.compile("下一页"))
if next_link:
next_href = next_link.get('href', '')
next_href = next_link.get("href", "")
if next_href:
next_page_url = f"{BASE_URL}/admin/{next_href}"
@@ -351,56 +459,55 @@ class APIBrowser:
return articles, total_pages, next_page_url
def get_article_attachments(self, article_href: str):
"""
获取文章的附件列表和文章信息
Returns:
tuple: (attachments_list, article_info)
- attachments_list: 附件列表
- article_info: 包含 channel_id 和 article_id 的字典,用于标记文章已读
"""
if not article_href.startswith('http'):
"""获取文章的附件列表和文章信息"""
if not article_href.startswith("http"):
url = f"{BASE_URL}/admin/{article_href}"
else:
url = article_href
resp = self._request_with_retry('get', url)
soup = BeautifulSoup(resp.text, 'html.parser')
# 先检查缓存,避免不必要的请求
# 使用URL作为缓存键简化版本
cache_key = f"attachments_{hash(url)}"
cached_result = self._parse_cache.get(cache_key)
if cached_result:
return cached_result
resp = self._request_with_retry("get", url)
soup = BeautifulSoup(resp.text, "html.parser")
attachments = []
article_info = {'channel_id': None, 'article_id': None}
article_info = {"channel_id": None, "article_id": None}
# 从 saveread 按钮获取 channel_id 和 article_id
for elem in soup.find_all(['button', 'input']):
onclick = elem.get('onclick', '')
match = re.search(r'saveread\((\d+),(\d+)\)', onclick)
for elem in soup.find_all(["button", "input"]):
onclick = elem.get("onclick", "")
match = re.search(r"saveread\((\d+),(\d+)\)", onclick)
if match:
article_info['channel_id'] = match.group(1)
article_info['article_id'] = match.group(2)
article_info["channel_id"] = match.group(1)
article_info["article_id"] = match.group(2)
break
attach_list = soup.find('div', {'class': 'attach-list2'})
attach_list = soup.find("div", {"class": "attach-list2"})
if attach_list:
items = attach_list.find_all('li')
items = attach_list.find_all("li")
for item in items:
download_links = item.find_all('a', onclick=re.compile(r'download2?\.ashx'))
download_links = item.find_all("a", onclick=re.compile(r"download2?\.ashx"))
for link in download_links:
onclick = link.get('onclick', '')
id_match = re.search(r'id=(\d+)', onclick)
channel_match = re.search(r'channel_id=(\d+)', onclick)
onclick = link.get("onclick", "")
id_match = re.search(r"id=(\d+)", onclick)
channel_match = re.search(r"channel_id=(\d+)", onclick)
if id_match:
attach_id = id_match.group(1)
channel_id = channel_match.group(1) if channel_match else '1'
h3 = item.find('h3')
filename = h3.get_text().strip() if h3 else f'附件{attach_id}'
attachments.append({
'id': attach_id,
'channel_id': channel_id,
'filename': filename
})
channel_id = channel_match.group(1) if channel_match else "1"
h3 = item.find("h3")
filename = h3.get_text().strip() if h3 else f"附件{attach_id}"
attachments.append({"id": attach_id, "channel_id": channel_id, "filename": filename})
break
return attachments, article_info
result = (attachments, article_info)
# 存入缓存
self._parse_cache.set(cache_key, result)
return result
def mark_article_read(self, channel_id: str, article_id: str) -> bool:
"""通过 saveread API 标记文章已读"""
@@ -408,7 +515,10 @@ class APIBrowser:
return False
import random
saveread_url = f"{BASE_URL}/tools/submit_ajax.ashx?action=saveread&time={random.random()}&fl={channel_id}&id={article_id}"
saveread_url = (
f"{BASE_URL}/tools/submit_ajax.ashx?action=saveread&time={random.random()}&fl={channel_id}&id={article_id}"
)
try:
resp = self._request_with_retry("post", saveread_url)
@@ -416,14 +526,14 @@ class APIBrowser:
if resp.status_code == 200:
try:
data = resp.json()
return data.get('status') == 1
return data.get("status") == 1
except:
return True # 如果不是 JSON 但状态码 200也认为成功
return False
except:
return False
def mark_read(self, attach_id: str, channel_id: str = '1') -> bool:
def mark_read(self, attach_id: str, channel_id: str = "1") -> bool:
"""通过访问预览通道标记附件已读"""
download_url = f"{BASE_URL}/tools/download2.ashx?site=main&id={attach_id}&channel_id={channel_id}"
@@ -461,7 +571,7 @@ class APIBrowser:
# 网站更新后参数: 0=应读, 1=已读(注册前未读需通过页面交互切换)
# 当前前端选项: 注册前未读、应读(默认应读)
browse_type_text = str(browse_type or "")
if '注册前' in browse_type_text:
if "注册前" in browse_type_text:
bz = 0 # 注册前未读(暂与应读相同,网站通过页面状态区分)
else:
bz = 0 # 应读
@@ -528,14 +638,14 @@ class APIBrowser:
if should_stop_callback and should_stop_callback():
break
article_href = article['href']
article_href = article["href"]
# 跳过已处理的文章
if article_href in processed_hrefs:
continue
processed_hrefs.add(article_href)
new_articles_in_page += 1
title = article['title'][:30]
title = article["title"][:30]
# 获取附件和文章信息(文章详情页)
try:
@@ -556,16 +666,13 @@ class APIBrowser:
# 标记文章已读(调用 saveread API)
article_marked = False
if article_info.get('channel_id') and article_info.get('article_id'):
article_marked = self.mark_article_read(
article_info['channel_id'],
article_info['article_id']
)
if article_info.get("channel_id") and article_info.get("article_id"):
article_marked = self.mark_article_read(article_info["channel_id"], article_info["article_id"])
# 处理附件(如果有)
if attachments:
for attach in attachments:
if self.mark_read(attach['id'], attach['channel_id']):
if self.mark_read(attach["id"], attach["channel_id"]):
total_attachments += 1
self.log(f"[API] [{total_items}] {title} - {len(attachments)}个附件")
@@ -574,9 +681,10 @@ class APIBrowser:
status = "已标记" if article_marked else "标记失败"
self.log(f"[API] [{total_items}] {title} - 无附件({status})")
time.sleep(0.1)
# 智能延迟策略:根据连续失败次数和文章数量动态调整
time.sleep(self._calculate_adaptive_delay(total_items, consecutive_failures))
time.sleep(0.2)
time.sleep(self._calculate_page_delay(current_page, new_articles_in_page))
# 决定下一步获取哪一页
if new_articles_in_page > 0:
@@ -599,7 +707,9 @@ class APIBrowser:
report_progress(force=True)
if skipped_items:
self.log(f"[API] 浏览完成: {total_items} 条内容,{total_attachments} 个附件(跳过 {skipped_items} 条内容)")
self.log(
f"[API] 浏览完成: {total_items} 条内容,{total_attachments} 个附件(跳过 {skipped_items} 条内容)"
)
else:
self.log(f"[API] 浏览完成: {total_items} 条内容,{total_attachments} 个附件")
@@ -656,7 +766,7 @@ def warmup_api_connection(proxy_config: Optional[dict] = None, log_callback: Opt
# 发送一个轻量级请求建立连接
resp = session.get(f"{BASE_URL}/admin/login.aspx", timeout=10, allow_redirects=False)
log(f" API 连接预热完成 (status={resp.status_code})")
log(f"[OK] API 连接预热完成 (status={resp.status_code})")
session.close()
return True
except Exception as e: