fix(api): handle timeouts per item to avoid aborting mid-run

2025-12-17 15:49:05 +08:00
parent 6827d11f40
commit 3f667dd21b


@@ -233,105 +233,99 @@ class APIBrowser:
         if not self.logged_in:
             return [], 0, None
-        try:
-            if base_url and page > 1:
-                url = re.sub(r'page=\d+', f'page={page}', base_url)
-            else:
-                url = f"{BASE_URL}/admin/center.aspx?bz={bz}"
-            resp = self._request_with_retry('get', url)
-            soup = BeautifulSoup(resp.text, 'html.parser')
-            articles = []
-            ltable = soup.find('table', {'class': 'ltable'})
-            if ltable:
-                rows = ltable.find_all('tr')[1:]
-                for row in rows:
-                    # Skip the "no records yet" placeholder row
-                    if '暂无记录' in row.get_text():
-                        continue
-                    link = row.find('a', href=True)
-                    if link:
-                        href = link.get('href', '')
-                        title = link.get_text().strip()
-                        match = re.search(r'id=(\d+)', href)
-                        article_id = match.group(1) if match else None
-                        articles.append({
-                            'title': title,
-                            'href': href,
-                            'article_id': article_id,
-                        })
-            # Derive the total page count
-            total_pages = 1
-            next_page_url = None
-            total_records = 0
-            page_content = soup.find(id='PageContent')
-            if page_content:
-                text = page_content.get_text()
-                total_match = re.search(r'共(\d+)记录', text)
-                if total_match:
-                    total_records = int(total_match.group(1))
-                    total_pages = (total_records + 9) // 10
-                next_link = page_content.find('a', string=re.compile('下一页'))
-                if next_link:
-                    next_href = next_link.get('href', '')
-                    if next_href:
-                        next_page_url = f"{BASE_URL}/admin/{next_href}"
-            try:
-                self.last_total_records = int(total_records or 0)
-            except Exception:
-                self.last_total_records = 0
-            return articles, total_pages, next_page_url
-        except Exception as e:
-            self.log(f"[API] Failed to fetch list: {str(e)}")
-            return [], 0, None
+        if base_url and page > 1:
+            url = re.sub(r'page=\d+', f'page={page}', base_url)
+        elif page > 1:
+            # Fallback: in the rare case the page offers no "next page" link, build the page param directly
+            url = f"{BASE_URL}/admin/center.aspx?bz={bz}&page={page}"
+        else:
+            url = f"{BASE_URL}/admin/center.aspx?bz={bz}"
+        resp = self._request_with_retry('get', url)
+        soup = BeautifulSoup(resp.text, 'html.parser')
+        articles = []
+        ltable = soup.find('table', {'class': 'ltable'})
+        if ltable:
+            rows = ltable.find_all('tr')[1:]
+            for row in rows:
+                # Skip the "no records yet" placeholder row
+                if '暂无记录' in row.get_text():
+                    continue
+                link = row.find('a', href=True)
+                if link:
+                    href = link.get('href', '')
+                    title = link.get_text().strip()
+                    match = re.search(r'id=(\d+)', href)
+                    article_id = match.group(1) if match else None
+                    articles.append({
+                        'title': title,
+                        'href': href,
+                        'article_id': article_id,
+                    })
+        # Derive the total page count
+        total_pages = 1
+        next_page_url = None
+        total_records = 0
+        page_content = soup.find(id='PageContent')
+        if page_content:
+            text = page_content.get_text()
+            total_match = re.search(r'共(\d+)记录', text)
+            if total_match:
+                total_records = int(total_match.group(1))
+                total_pages = (total_records + 9) // 10
+            next_link = page_content.find('a', string=re.compile('下一页'))
+            if next_link:
+                next_href = next_link.get('href', '')
+                if next_href:
+                    next_page_url = f"{BASE_URL}/admin/{next_href}"
+        try:
+            self.last_total_records = int(total_records or 0)
+        except Exception:
+            self.last_total_records = 0
+        return articles, total_pages, next_page_url
     def get_article_attachments(self, article_href: str):
         """Fetch the attachment list for an article."""
-        try:
-            if not article_href.startswith('http'):
-                url = f"{BASE_URL}/admin/{article_href}"
-            else:
-                url = article_href
-            resp = self._request_with_retry('get', url)
-            soup = BeautifulSoup(resp.text, 'html.parser')
-            attachments = []
-            attach_list = soup.find('div', {'class': 'attach-list2'})
-            if attach_list:
-                items = attach_list.find_all('li')
-                for item in items:
-                    download_links = item.find_all('a', onclick=re.compile(r'download\.ashx'))
-                    for link in download_links:
-                        onclick = link.get('onclick', '')
-                        id_match = re.search(r'id=(\d+)', onclick)
-                        channel_match = re.search(r'channel_id=(\d+)', onclick)
-                        if id_match:
-                            attach_id = id_match.group(1)
-                            channel_id = channel_match.group(1) if channel_match else '1'
-                            h3 = item.find('h3')
-                            filename = h3.get_text().strip() if h3 else f'附件{attach_id}'
-                            attachments.append({
-                                'id': attach_id,
-                                'channel_id': channel_id,
-                                'filename': filename
-                            })
-                            break
-            return attachments
-        except Exception as e:
-            return []
+        if not article_href.startswith('http'):
+            url = f"{BASE_URL}/admin/{article_href}"
+        else:
+            url = article_href
+        resp = self._request_with_retry('get', url)
+        soup = BeautifulSoup(resp.text, 'html.parser')
+        attachments = []
+        attach_list = soup.find('div', {'class': 'attach-list2'})
+        if attach_list:
+            items = attach_list.find_all('li')
+            for item in items:
+                download_links = item.find_all('a', onclick=re.compile(r'download\.ashx'))
+                for link in download_links:
+                    onclick = link.get('onclick', '')
+                    id_match = re.search(r'id=(\d+)', onclick)
+                    channel_match = re.search(r'channel_id=(\d+)', onclick)
+                    if id_match:
+                        attach_id = id_match.group(1)
+                        channel_id = channel_match.group(1) if channel_match else '1'
+                        h3 = item.find('h3')
+                        filename = h3.get_text().strip() if h3 else f'附件{attach_id}'
+                        attachments.append({
+                            'id': attach_id,
+                            'channel_id': channel_id,
+                            'filename': filename
+                        })
+                        break
+        return attachments

     def mark_read(self, attach_id: str, channel_id: str = '1') -> bool:
         """Mark as read by visiting the download link."""
@@ -383,9 +377,19 @@ class APIBrowser:
         total_attachments = 0
         page = 1
         base_url = None
+        skipped_items = 0
+        skipped_pages = 0
+        consecutive_failures = 0
+        max_consecutive_failures = 3

         # Fetch the first page
-        articles, total_pages, next_url = self.get_article_list_page(bz, page)
+        try:
+            articles, total_pages, next_url = self.get_article_list_page(bz, page)
+            consecutive_failures = 0
+        except Exception as e:
+            result.error_message = str(e)
+            self.log(f"[API] Failed to fetch page 1 of the list: {str(e)}")
+            return result

         if not articles:
             self.log(f"[API] '{browse_type}' has no pending content")
@@ -396,6 +400,8 @@ class APIBrowser:
         if next_url:
             base_url = next_url
+        elif total_pages > 1:
+            base_url = f"{BASE_URL}/admin/center.aspx?bz={bz}&page=2"

         total_records = int(getattr(self, "last_total_records", 0) or 0)
         last_report_ts = 0.0
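
Note: with this fallback, a missing "next page" link no longer strands the loop on page 1. The URL selection reduces to the small pure function below (a sketch only; BASE_URL is a placeholder and the query-string convention is taken from the hunks above):

    import re
    from typing import Optional

    BASE_URL = "http://example.internal"  # placeholder for the real constant

    def list_url(bz: str, page: int, base_url: Optional[str] = None) -> str:
        # Prefer rewriting the server-supplied "next page" link, which may
        # carry extra query parameters; otherwise build page= by hand.
        if base_url and page > 1:
            return re.sub(r'page=\d+', f'page={page}', base_url)
        if page > 1:
            return f"{BASE_URL}/admin/center.aspx?bz={bz}&page={page}"
        return f"{BASE_URL}/admin/center.aspx?bz={bz}"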
@@ -416,22 +422,51 @@ class APIBrowser:
         report_progress(force=True)

         # Process all pages
-        while True:
+        while page <= total_pages:
             if should_stop_callback and should_stop_callback():
                 self.log("[API] Stop signal received")
                 break

+            # Page 1 was already fetched; later pages are fetched here
+            if page > 1:
+                try:
+                    articles, _, next_url = self.get_article_list_page(bz, page, base_url)
+                    consecutive_failures = 0
+                    if next_url:
+                        base_url = next_url
+                except Exception as e:
+                    skipped_pages += 1
+                    consecutive_failures += 1
+                    self.log(
+                        f"[API] Failed to fetch list page {page}, skipping it (consecutive failures {consecutive_failures}/{max_consecutive_failures}): {str(e)}"
+                    )
+                    if consecutive_failures >= max_consecutive_failures:
+                        raise
+                    page += 1
+                    continue

             for article in articles:
                 if should_stop_callback and should_stop_callback():
                     break
                 title = article['title'][:30]

+                # Fetch attachments (article detail page)
+                try:
+                    attachments = self.get_article_attachments(article['href'])
+                    consecutive_failures = 0
+                except Exception as e:
+                    skipped_items += 1
+                    consecutive_failures += 1
+                    self.log(
+                        f"[API] Failed to fetch article, skipping it (consecutive failures {consecutive_failures}/{max_consecutive_failures}): {title} | {str(e)}"
+                    )
+                    if consecutive_failures >= max_consecutive_failures:
+                        raise
+                    continue

                 total_items += 1
                 report_progress()

-                # Fetch attachments
-                attachments = self.get_article_attachments(article['href'])
                 if attachments:
                     for attach in attachments:
                         if self.mark_read(attach['id'], attach['channel_id']):
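
Note: both skip paths above follow the same circuit-breaker pattern: skip and count isolated failures, reset the streak on any success, and re-raise once failures run consecutively past a threshold (three here), so a dead session aborts loudly instead of silently skipping everything. Condensed into a self-contained helper (names are illustrative, not from this codebase):

    from typing import Callable, Iterable, TypeVar

    T = TypeVar("T")
    R = TypeVar("R")

    def process_with_breaker(
        items: Iterable[T],
        handle: Callable[[T], R],
        max_consecutive_failures: int = 3,
    ) -> tuple[list[R], int]:
        """Skip isolated failures; abort after too many in a row."""
        results: list[R] = []
        skipped = 0
        consecutive = 0
        for item in items:
            try:
                results.append(handle(item))
                consecutive = 0        # any success resets the streak
            except Exception:
                skipped += 1
                consecutive += 1
                if consecutive >= max_consecutive_failures:
                    raise              # likely systemic, not one bad item
        return results, skipped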
@@ -441,22 +476,16 @@ class APIBrowser:
                 time.sleep(0.1)

-            # Next page
             page += 1
-            if page > total_pages:
-                break
-            articles, _, next_url = self.get_article_list_page(bz, page, base_url)
-            if not articles:
-                break
-            if next_url:
-                base_url = next_url
             time.sleep(0.2)

         report_progress(force=True)
-        self.log(f"[API] Browse finished: {total_items} items, {total_attachments} attachments")
+        if skipped_items or skipped_pages:
+            self.log(
+                f"[API] Browse finished: {total_items} items, {total_attachments} attachments (skipped {skipped_items} items, {skipped_pages} list pages)"
+            )
+        else:
+            self.log(f"[API] Browse finished: {total_items} items, {total_attachments} attachments")
         result.success = True
         result.total_items = total_items
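
As a quick check of the breaker sketch above: a handler that fails on every third item completes with three skips (failures are never consecutive), while one that always fails would raise on its third attempt. `flaky` is hypothetical:

    def flaky(n: int) -> int:
        if n % 3 == 0:
            raise TimeoutError(n)
        return n * n

    results, skipped = process_with_breaker(range(1, 10), flaky)
    print(results, skipped)  # [1, 4, 16, 25, 49, 64] 3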