From 3f667dd21bddf84b6c42a0b68a2165361caed0c8 Mon Sep 17 00:00:00 2001
From: yuyx <237899745@qq.com>
Date: Wed, 17 Dec 2025 15:49:05 +0800
Subject: [PATCH] =?UTF-8?q?fix(api):=20=E8=B6=85=E6=97=B6=E6=8C=89?=
 =?UTF-8?q?=E5=8D=95=E6=9D=A1=E5=A4=84=E7=90=86=E9=81=BF=E5=85=8D=E4=B8=AD?=
 =?UTF-8?q?=E9=80=94=E7=BB=93=E6=9D=9F?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 api_browser.py | 245 +++++++++++++++++++++++++++----------------------
 1 file changed, 137 insertions(+), 108 deletions(-)

diff --git a/api_browser.py b/api_browser.py
index 67b4e05..577e9ed 100755
--- a/api_browser.py
+++ b/api_browser.py
@@ -233,105 +233,99 @@ class APIBrowser:
         if not self.logged_in:
             return [], 0, None
 
+        if base_url and page > 1:
+            url = re.sub(r'page=\d+', f'page={page}', base_url)
+        elif page > 1:
+            # 兼容兜底:若没有 next_url(极少数情况下页面不提供"下一页"链接),尝试直接拼 page 参数
+            url = f"{BASE_URL}/admin/center.aspx?bz={bz}&page={page}"
+        else:
+            url = f"{BASE_URL}/admin/center.aspx?bz={bz}"
+
+        resp = self._request_with_retry('get', url)
+        soup = BeautifulSoup(resp.text, 'html.parser')
+        articles = []
+
+        ltable = soup.find('table', {'class': 'ltable'})
+        if ltable:
+            rows = ltable.find_all('tr')[1:]
+            for row in rows:
+                # 检查是否是"暂无记录"
+                if '暂无记录' in row.get_text():
+                    continue
+
+                link = row.find('a', href=True)
+                if link:
+                    href = link.get('href', '')
+                    title = link.get_text().strip()
+
+                    match = re.search(r'id=(\d+)', href)
+                    article_id = match.group(1) if match else None
+
+                    articles.append({
+                        'title': title,
+                        'href': href,
+                        'article_id': article_id,
+                    })
+
+        # 获取总页数
+        total_pages = 1
+        next_page_url = None
+        total_records = 0
+
+        page_content = soup.find(id='PageContent')
+        if page_content:
+            text = page_content.get_text()
+            total_match = re.search(r'共(\d+)记录', text)
+            if total_match:
+                total_records = int(total_match.group(1))
+                total_pages = (total_records + 9) // 10
+
+            next_link = page_content.find('a', string=re.compile('下一页'))
+            if next_link:
+                next_href = next_link.get('href', '')
+                if next_href:
+                    next_page_url = f"{BASE_URL}/admin/{next_href}"
+
         try:
-            if base_url and page > 1:
-                url = re.sub(r'page=\d+', f'page={page}', base_url)
-            else:
-                url = f"{BASE_URL}/admin/center.aspx?bz={bz}"
-
-            resp = self._request_with_retry('get', url)
-            soup = BeautifulSoup(resp.text, 'html.parser')
-            articles = []
-
-            ltable = soup.find('table', {'class': 'ltable'})
-            if ltable:
-                rows = ltable.find_all('tr')[1:]
-                for row in rows:
-                    # 检查是否是"暂无记录"
-                    if '暂无记录' in row.get_text():
-                        continue
-
-                    link = row.find('a', href=True)
-                    if link:
-                        href = link.get('href', '')
-                        title = link.get_text().strip()
-
-                        match = re.search(r'id=(\d+)', href)
-                        article_id = match.group(1) if match else None
-
-                        articles.append({
-                            'title': title,
-                            'href': href,
-                            'article_id': article_id,
-                        })
-
-            # 获取总页数
-            total_pages = 1
-            next_page_url = None
-            total_records = 0
-
-            page_content = soup.find(id='PageContent')
-            if page_content:
-                text = page_content.get_text()
-                total_match = re.search(r'共(\d+)记录', text)
-                if total_match:
-                    total_records = int(total_match.group(1))
-                    total_pages = (total_records + 9) // 10
-
-                next_link = page_content.find('a', string=re.compile('下一页'))
-                if next_link:
-                    next_href = next_link.get('href', '')
-                    if next_href:
-                        next_page_url = f"{BASE_URL}/admin/{next_href}"
-
-            try:
-                self.last_total_records = int(total_records or 0)
-            except Exception:
-                self.last_total_records = 0
-            return articles, total_pages, next_page_url
-
-        except Exception as e:
-            self.log(f"[API] 获取列表失败: {str(e)}")
-            return [], 0, None
+            self.last_total_records = int(total_records or 0)
+        except Exception:
+            self.last_total_records = 0
+        return articles, total_pages, next_page_url
 
     def get_article_attachments(self, article_href: str):
         """获取文章的附件列表"""
-        try:
-            if not article_href.startswith('http'):
-                url = f"{BASE_URL}/admin/{article_href}"
-            else:
-                url = article_href
+        if not article_href.startswith('http'):
+            url = f"{BASE_URL}/admin/{article_href}"
+        else:
+            url = article_href
 
-            resp = self._request_with_retry('get', url)
-            soup = BeautifulSoup(resp.text, 'html.parser')
+        resp = self._request_with_retry('get', url)
+        soup = BeautifulSoup(resp.text, 'html.parser')
 
-            attachments = []
+        attachments = []
 
-            attach_list = soup.find('div', {'class': 'attach-list2'})
-            if attach_list:
-                items = attach_list.find_all('li')
-                for item in items:
-                    download_links = item.find_all('a', onclick=re.compile(r'download\.ashx'))
-                    for link in download_links:
-                        onclick = link.get('onclick', '')
-                        id_match = re.search(r'id=(\d+)', onclick)
-                        channel_match = re.search(r'channel_id=(\d+)', onclick)
-                        if id_match:
-                            attach_id = id_match.group(1)
-                            channel_id = channel_match.group(1) if channel_match else '1'
-                            h3 = item.find('h3')
-                            filename = h3.get_text().strip() if h3 else f'附件{attach_id}'
-                            attachments.append({
-                                'id': attach_id,
-                                'channel_id': channel_id,
-                                'filename': filename
-                            })
-                            break
+        attach_list = soup.find('div', {'class': 'attach-list2'})
+        if attach_list:
+            items = attach_list.find_all('li')
+            for item in items:
+                download_links = item.find_all('a', onclick=re.compile(r'download\.ashx'))
+                for link in download_links:
+                    onclick = link.get('onclick', '')
+                    id_match = re.search(r'id=(\d+)', onclick)
+                    channel_match = re.search(r'channel_id=(\d+)', onclick)
+                    if id_match:
+                        attach_id = id_match.group(1)
+                        channel_id = channel_match.group(1) if channel_match else '1'
+                        h3 = item.find('h3')
+                        filename = h3.get_text().strip() if h3 else f'附件{attach_id}'
+                        attachments.append({
+                            'id': attach_id,
+                            'channel_id': channel_id,
+                            'filename': filename
+                        })
+                        break
 
-            return attachments
-
-        except Exception as e:
-            return []
+        return attachments
 
     def mark_read(self, attach_id: str, channel_id: str = '1') -> bool:
         """通过访问下载链接标记已读"""
@@ -383,9 +377,19 @@ class APIBrowser:
         total_attachments = 0
         page = 1
         base_url = None
+        skipped_items = 0
+        skipped_pages = 0
+        consecutive_failures = 0
+        max_consecutive_failures = 3
 
         # 获取第一页
-        articles, total_pages, next_url = self.get_article_list_page(bz, page)
+        try:
+            articles, total_pages, next_url = self.get_article_list_page(bz, page)
+            consecutive_failures = 0
+        except Exception as e:
+            result.error_message = str(e)
+            self.log(f"[API] 获取第1页列表失败: {str(e)}")
+            return result
 
         if not articles:
             self.log(f"[API] '{browse_type}' 没有待处理内容")
@@ -396,6 +400,8 @@
 
         if next_url:
             base_url = next_url
+        elif total_pages > 1:
+            base_url = f"{BASE_URL}/admin/center.aspx?bz={bz}&page=2"
 
         total_records = int(getattr(self, "last_total_records", 0) or 0)
         last_report_ts = 0.0
@@ -416,22 +422,51 @@ class APIBrowser:
             report_progress(force=True)
 
         # 处理所有页面
-        while True:
+        while page <= total_pages:
             if should_stop_callback and should_stop_callback():
                 self.log("[API] 收到停止信号")
                 break
 
+            # page==1 已取过,后续页在这里获取
+            if page > 1:
+                try:
+                    articles, _, next_url = self.get_article_list_page(bz, page, base_url)
+                    consecutive_failures = 0
+                    if next_url:
+                        base_url = next_url
+                except Exception as e:
+                    skipped_pages += 1
+                    consecutive_failures += 1
+                    self.log(
+                        f"[API] 获取第{page}页列表失败,跳过本页(连续失败{consecutive_failures}/{max_consecutive_failures}): {str(e)}"
+                    )
+                    if consecutive_failures >= max_consecutive_failures:
+                        raise
+                    page += 1
+                    continue
+
             for article in articles:
                 if should_stop_callback and should_stop_callback():
                     break
 
                 title = article['title'][:30]
 
+                # 获取附件(文章详情页)
+                try:
+                    attachments = self.get_article_attachments(article['href'])
+                    consecutive_failures = 0
+                except Exception as e:
+                    skipped_items += 1
+                    consecutive_failures += 1
+                    self.log(
+                        f"[API] 获取文章失败,跳过(连续失败{consecutive_failures}/{max_consecutive_failures}): {title} | {str(e)}"
+                    )
+                    if consecutive_failures >= max_consecutive_failures:
+                        raise
+                    continue
+
                 total_items += 1
                 report_progress()
 
-                # 获取附件
-                attachments = self.get_article_attachments(article['href'])
-
                 if attachments:
                     for attach in attachments:
                         if self.mark_read(attach['id'], attach['channel_id']):
@@ -441,22 +476,16 @@ class APIBrowser:
 
                 time.sleep(0.1)
 
-            # 下一页
             page += 1
-            if page > total_pages:
-                break
-
-            articles, _, next_url = self.get_article_list_page(bz, page, base_url)
-            if not articles:
-                break
-
-            if next_url:
-                base_url = next_url
-
             time.sleep(0.2)
 
         report_progress(force=True)
-        self.log(f"[API] 浏览完成: {total_items} 条内容,{total_attachments} 个附件")
+        if skipped_items or skipped_pages:
+            self.log(
+                f"[API] 浏览完成: {total_items} 条内容,{total_attachments} 个附件(跳过 {skipped_items} 条内容,{skipped_pages} 页列表)"
+            )
+        else:
+            self.log(f"[API] 浏览完成: {total_items} 条内容,{total_attachments} 个附件")
 
         result.success = True
         result.total_items = total_items
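
Reviewer note: the hunks above all apply the same skip-and-circuit-break pattern. Each page or article fetch gets its own try/except, skipped work is counted instead of aborting the run, and only max_consecutive_failures back-to-back errors re-raise and end the whole browse. Below is a minimal standalone sketch of that pattern, assuming a hypothetical fetch_item call; the names and numbers here are illustrative stand-ins, not code taken from api_browser.py.

# Illustrative sketch only; fetch_item and MAX_CONSECUTIVE_FAILURES are
# hypothetical stand-ins, not part of api_browser.py.
MAX_CONSECUTIVE_FAILURES = 3

def fetch_item(item: int) -> str:
    # Placeholder for a network call that may time out.
    if item % 5 == 0:
        raise TimeoutError(f"timeout on item {item}")
    return f"ok:{item}"

def browse_all(items):
    consecutive_failures = 0
    processed = skipped = 0
    for item in items:
        try:
            fetch_item(item)
            consecutive_failures = 0      # any success resets the failure streak
        except Exception as exc:
            skipped += 1
            consecutive_failures += 1
            print(f"skip {item}: {exc} "
                  f"({consecutive_failures}/{MAX_CONSECUTIVE_FAILURES})")
            if consecutive_failures >= MAX_CONSECUTIVE_FAILURES:
                raise                     # persistent outage: stop the whole run
            continue                      # isolated failure: move on to the next item
        processed += 1
    return processed, skipped

if __name__ == "__main__":
    # Two isolated timeouts (items 5 and 10) are skipped; the run still finishes.
    print(browse_all(range(1, 11)))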