fix(api): handle timeouts per item to avoid aborting mid-run

2025-12-17 15:49:05 +08:00
parent 6827d11f40
commit 3f667dd21b

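In essence, the fix moves error handling out of the fetch helpers and into the browse loop: each page and each article is tried individually, failures are skipped and counted, and the run only aborts after several consecutive failures. Below is a minimal sketch of that pattern; browse_all, fetch_page and handle_item are hypothetical stand-ins, not functions from this codebase.

from typing import Callable, Iterable

def browse_all(pages: Iterable[int],
               fetch_page: Callable[[int], list],
               handle_item: Callable[[dict], None],
               max_consecutive_failures: int = 3) -> dict:
    """Process every page and item; skip individual failures and abort
    only after max_consecutive_failures errors in a row."""
    stats = {"items": 0, "skipped_items": 0, "skipped_pages": 0}
    consecutive_failures = 0
    for page in pages:
        try:
            items = fetch_page(page)
            consecutive_failures = 0
        except Exception:
            # One bad page costs only that page.
            stats["skipped_pages"] += 1
            consecutive_failures += 1
            if consecutive_failures >= max_consecutive_failures:
                raise  # too many failures in a row: give up
            continue
        for item in items:
            try:
                handle_item(item)
                consecutive_failures = 0
                stats["items"] += 1
            except Exception:
                # One bad item costs only that item.
                stats["skipped_items"] += 1
                consecutive_failures += 1
                if consecutive_failures >= max_consecutive_failures:
                    raise
    return stats

With this shape, a single slow or broken article costs only that article (or page), while a server that fails repeatedly still stops the run quickly instead of silently returning empty results.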

@@ -233,105 +233,99 @@ class APIBrowser:
if not self.logged_in:
return [], 0, None
if base_url and page > 1:
url = re.sub(r'page=\d+', f'page={page}', base_url)
elif page > 1:
# Compatibility fallback: if there is no next_url (in rare cases the page offers no "下一页" (next page) link), try building the page parameter directly
url = f"{BASE_URL}/admin/center.aspx?bz={bz}&page={page}"
else:
url = f"{BASE_URL}/admin/center.aspx?bz={bz}"
resp = self._request_with_retry('get', url)
soup = BeautifulSoup(resp.text, 'html.parser')
articles = []
ltable = soup.find('table', {'class': 'ltable'})
if ltable:
rows = ltable.find_all('tr')[1:]
for row in rows:
# Check for the "暂无记录" (no records) placeholder row
if '暂无记录' in row.get_text():
continue
link = row.find('a', href=True)
if link:
href = link.get('href', '')
title = link.get_text().strip()
match = re.search(r'id=(\d+)', href)
article_id = match.group(1) if match else None
articles.append({
'title': title,
'href': href,
'article_id': article_id,
})
# Get the total page count
total_pages = 1
next_page_url = None
total_records = 0
page_content = soup.find(id='PageContent')
if page_content:
text = page_content.get_text()
total_match = re.search(r'共(\d+)记录', text)
if total_match:
total_records = int(total_match.group(1))
total_pages = (total_records + 9) // 10
next_link = page_content.find('a', string=re.compile('下一页'))
if next_link:
next_href = next_link.get('href', '')
if next_href:
next_page_url = f"{BASE_URL}/admin/{next_href}"
try:
if base_url and page > 1:
url = re.sub(r'page=\d+', f'page={page}', base_url)
else:
url = f"{BASE_URL}/admin/center.aspx?bz={bz}"
resp = self._request_with_retry('get', url)
soup = BeautifulSoup(resp.text, 'html.parser')
articles = []
ltable = soup.find('table', {'class': 'ltable'})
if ltable:
rows = ltable.find_all('tr')[1:]
for row in rows:
# Check for the "暂无记录" (no records) placeholder row
if '暂无记录' in row.get_text():
continue
link = row.find('a', href=True)
if link:
href = link.get('href', '')
title = link.get_text().strip()
match = re.search(r'id=(\d+)', href)
article_id = match.group(1) if match else None
articles.append({
'title': title,
'href': href,
'article_id': article_id,
})
# Get the total page count
total_pages = 1
next_page_url = None
total_records = 0
page_content = soup.find(id='PageContent')
if page_content:
text = page_content.get_text()
total_match = re.search(r'共(\d+)记录', text)
if total_match:
total_records = int(total_match.group(1))
total_pages = (total_records + 9) // 10
next_link = page_content.find('a', string=re.compile('下一页'))
if next_link:
next_href = next_link.get('href', '')
if next_href:
next_page_url = f"{BASE_URL}/admin/{next_href}"
try:
self.last_total_records = int(total_records or 0)
except Exception:
self.last_total_records = 0
return articles, total_pages, next_page_url
except Exception as e:
self.log(f"[API] 获取列表失败: {str(e)}")
return [], 0, None
try:
self.last_total_records = int(total_records or 0)
except Exception:
self.last_total_records = 0
return articles, total_pages, next_page_url
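The page count above is derived from the record count parsed out of the "共N记录" text using ceiling division, on the assumption of 10 records per page. A quick illustration of that arithmetic (pages_for is a hypothetical helper, not part of APIBrowser):

def pages_for(total_records: int, page_size: int = 10) -> int:
    # Ceiling division: (n + size - 1) // size
    return (total_records + page_size - 1) // page_size

assert pages_for(37) == 4   # 37 records -> 4 pages of 10
assert pages_for(40) == 4
assert pages_for(41) == 5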
def get_article_attachments(self, article_href: str):
"""获取文章的附件列表"""
try:
if not article_href.startswith('http'):
url = f"{BASE_URL}/admin/{article_href}"
else:
url = article_href
resp = self._request_with_retry('get', url)
soup = BeautifulSoup(resp.text, 'html.parser')
attachments = []
attach_list = soup.find('div', {'class': 'attach-list2'})
if attach_list:
items = attach_list.find_all('li')
for item in items:
download_links = item.find_all('a', onclick=re.compile(r'download\.ashx'))
for link in download_links:
onclick = link.get('onclick', '')
id_match = re.search(r'id=(\d+)', onclick)
channel_match = re.search(r'channel_id=(\d+)', onclick)
if id_match:
attach_id = id_match.group(1)
channel_id = channel_match.group(1) if channel_match else '1'
h3 = item.find('h3')
filename = h3.get_text().strip() if h3 else f'附件{attach_id}'
attachments.append({
'id': attach_id,
'channel_id': channel_id,
'filename': filename
})
break
return attachments
except Exception as e:
return []
def mark_read(self, attach_id: str, channel_id: str = '1') -> bool:
"""通过访问下载链接标记已读"""
@@ -383,9 +377,19 @@ class APIBrowser:
total_attachments = 0
page = 1
base_url = None
skipped_items = 0
skipped_pages = 0
consecutive_failures = 0
max_consecutive_failures = 3
# Fetch the first page
articles, total_pages, next_url = self.get_article_list_page(bz, page)
try:
articles, total_pages, next_url = self.get_article_list_page(bz, page)
consecutive_failures = 0
except Exception as e:
result.error_message = str(e)
self.log(f"[API] 获取第1页列表失败: {str(e)}")
return result
if not articles:
self.log(f"[API] '{browse_type}' 没有待处理内容")
@@ -396,6 +400,8 @@ class APIBrowser:
if next_url:
base_url = next_url
elif total_pages > 1:
base_url = f"{BASE_URL}/admin/center.aspx?bz={bz}&page=2"
total_records = int(getattr(self, "last_total_records", 0) or 0)
last_report_ts = 0.0
@@ -416,22 +422,51 @@ class APIBrowser:
report_progress(force=True)
# Process all pages
while True:
while page <= total_pages:
if should_stop_callback and should_stop_callback():
self.log("[API] 收到停止信号")
break
# Page 1 was fetched above; subsequent pages are fetched here
if page > 1:
try:
articles, _, next_url = self.get_article_list_page(bz, page, base_url)
consecutive_failures = 0
if next_url:
base_url = next_url
except Exception as e:
skipped_pages += 1
consecutive_failures += 1
self.log(
f"[API] 获取第{page}页列表失败,跳过本页(连续失败{consecutive_failures}/{max_consecutive_failures}: {str(e)}"
)
if consecutive_failures >= max_consecutive_failures:
raise
page += 1
continue
for article in articles:
if should_stop_callback and should_stop_callback():
break
title = article['title'][:30]
# Fetch attachments (article detail page)
try:
attachments = self.get_article_attachments(article['href'])
consecutive_failures = 0
except Exception as e:
skipped_items += 1
consecutive_failures += 1
self.log(
f"[API] 获取文章失败,跳过(连续失败{consecutive_failures}/{max_consecutive_failures}: {title} | {str(e)}"
)
if consecutive_failures >= max_consecutive_failures:
raise
continue
total_items += 1
report_progress()
# Fetch attachments
attachments = self.get_article_attachments(article['href'])
if attachments:
for attach in attachments:
if self.mark_read(attach['id'], attach['channel_id']):
@@ -441,22 +476,16 @@ class APIBrowser:
time.sleep(0.1)
# Next page
page += 1
if page > total_pages:
break
articles, _, next_url = self.get_article_list_page(bz, page, base_url)
if not articles:
break
if next_url:
base_url = next_url
time.sleep(0.2)
report_progress(force=True)
self.log(f"[API] 浏览完成: {total_items} 条内容,{total_attachments} 个附件")
if skipped_items or skipped_pages:
self.log(
f"[API] 浏览完成: {total_items} 条内容,{total_attachments} 个附件(跳过 {skipped_items} 条内容,{skipped_pages} 页列表)"
)
else:
self.log(f"[API] 浏览完成: {total_items} 条内容,{total_attachments} 个附件")
result.success = True
result.total_items = total_items