fix(api): handle timeouts per item to avoid aborting mid-run

2025-12-17 15:49:05 +08:00
parent 6827d11f40
commit 3f667dd21b


@@ -233,105 +233,99 @@ class APIBrowser:
         if not self.logged_in:
             return [], 0, None
-        try:
-            if base_url and page > 1:
-                url = re.sub(r'page=\d+', f'page={page}', base_url)
-            else:
-                url = f"{BASE_URL}/admin/center.aspx?bz={bz}"
-            resp = self._request_with_retry('get', url)
-            soup = BeautifulSoup(resp.text, 'html.parser')
-            articles = []
-            ltable = soup.find('table', {'class': 'ltable'})
-            if ltable:
-                rows = ltable.find_all('tr')[1:]
-                for row in rows:
-                    # Skip the "no records yet" placeholder row
-                    if '暂无记录' in row.get_text():
-                        continue
-                    link = row.find('a', href=True)
-                    if link:
-                        href = link.get('href', '')
-                        title = link.get_text().strip()
-                        match = re.search(r'id=(\d+)', href)
-                        article_id = match.group(1) if match else None
-                        articles.append({
-                            'title': title,
-                            'href': href,
-                            'article_id': article_id,
-                        })
-            # Derive the total page count
-            total_pages = 1
-            next_page_url = None
-            total_records = 0
-            page_content = soup.find(id='PageContent')
-            if page_content:
-                text = page_content.get_text()
-                total_match = re.search(r'共(\d+)记录', text)
-                if total_match:
-                    total_records = int(total_match.group(1))
-                    total_pages = (total_records + 9) // 10
-                next_link = page_content.find('a', string=re.compile('下一页'))
-                if next_link:
-                    next_href = next_link.get('href', '')
-                    if next_href:
-                        next_page_url = f"{BASE_URL}/admin/{next_href}"
-            try:
-                self.last_total_records = int(total_records or 0)
-            except Exception:
-                self.last_total_records = 0
-            return articles, total_pages, next_page_url
-        except Exception as e:
-            self.log(f"[API] Failed to fetch list: {str(e)}")
-            return [], 0, None
+        if base_url and page > 1:
+            url = re.sub(r'page=\d+', f'page={page}', base_url)
+        elif page > 1:
+            # Fallback: in the rare case the page offers no "next page" link, build the page param directly
+            url = f"{BASE_URL}/admin/center.aspx?bz={bz}&page={page}"
+        else:
+            url = f"{BASE_URL}/admin/center.aspx?bz={bz}"
+        resp = self._request_with_retry('get', url)
+        soup = BeautifulSoup(resp.text, 'html.parser')
+        articles = []
+        ltable = soup.find('table', {'class': 'ltable'})
+        if ltable:
+            rows = ltable.find_all('tr')[1:]
+            for row in rows:
+                # Skip the "no records yet" placeholder row
+                if '暂无记录' in row.get_text():
+                    continue
+                link = row.find('a', href=True)
+                if link:
+                    href = link.get('href', '')
+                    title = link.get_text().strip()
+                    match = re.search(r'id=(\d+)', href)
+                    article_id = match.group(1) if match else None
+                    articles.append({
+                        'title': title,
+                        'href': href,
+                        'article_id': article_id,
+                    })
+        # Derive the total page count
+        total_pages = 1
+        next_page_url = None
+        total_records = 0
+        page_content = soup.find(id='PageContent')
+        if page_content:
+            text = page_content.get_text()
+            total_match = re.search(r'共(\d+)记录', text)
+            if total_match:
+                total_records = int(total_match.group(1))
+                total_pages = (total_records + 9) // 10
+            next_link = page_content.find('a', string=re.compile('下一页'))
+            if next_link:
+                next_href = next_link.get('href', '')
+                if next_href:
+                    next_page_url = f"{BASE_URL}/admin/{next_href}"
+        try:
+            self.last_total_records = int(total_records or 0)
+        except Exception:
+            self.last_total_records = 0
+        return articles, total_pages, next_page_url
     def get_article_attachments(self, article_href: str):
         """Fetch the attachment list for an article."""
-        try:
-            if not article_href.startswith('http'):
-                url = f"{BASE_URL}/admin/{article_href}"
-            else:
-                url = article_href
-            resp = self._request_with_retry('get', url)
-            soup = BeautifulSoup(resp.text, 'html.parser')
-            attachments = []
-            attach_list = soup.find('div', {'class': 'attach-list2'})
-            if attach_list:
-                items = attach_list.find_all('li')
-                for item in items:
-                    download_links = item.find_all('a', onclick=re.compile(r'download\.ashx'))
-                    for link in download_links:
-                        onclick = link.get('onclick', '')
-                        id_match = re.search(r'id=(\d+)', onclick)
-                        channel_match = re.search(r'channel_id=(\d+)', onclick)
-                        if id_match:
-                            attach_id = id_match.group(1)
-                            channel_id = channel_match.group(1) if channel_match else '1'
-                            h3 = item.find('h3')
-                            filename = h3.get_text().strip() if h3 else f'附件{attach_id}'
-                            attachments.append({
-                                'id': attach_id,
-                                'channel_id': channel_id,
-                                'filename': filename
-                            })
-                            break
-            return attachments
-        except Exception as e:
-            return []
+        if not article_href.startswith('http'):
+            url = f"{BASE_URL}/admin/{article_href}"
+        else:
+            url = article_href
+        resp = self._request_with_retry('get', url)
+        soup = BeautifulSoup(resp.text, 'html.parser')
+        attachments = []
+        attach_list = soup.find('div', {'class': 'attach-list2'})
+        if attach_list:
+            items = attach_list.find_all('li')
+            for item in items:
+                download_links = item.find_all('a', onclick=re.compile(r'download\.ashx'))
+                for link in download_links:
+                    onclick = link.get('onclick', '')
+                    id_match = re.search(r'id=(\d+)', onclick)
+                    channel_match = re.search(r'channel_id=(\d+)', onclick)
+                    if id_match:
+                        attach_id = id_match.group(1)
+                        channel_id = channel_match.group(1) if channel_match else '1'
+                        h3 = item.find('h3')
+                        filename = h3.get_text().strip() if h3 else f'附件{attach_id}'
+                        attachments.append({
+                            'id': attach_id,
+                            'channel_id': channel_id,
+                            'filename': filename
+                        })
+                        break
+        return attachments

     def mark_read(self, attach_id: str, channel_id: str = '1') -> bool:
         """Mark as read by visiting the download link."""
@@ -383,9 +377,19 @@ class APIBrowser:
         total_attachments = 0
         page = 1
         base_url = None
+        skipped_items = 0
+        skipped_pages = 0
+        consecutive_failures = 0
+        max_consecutive_failures = 3

         # Fetch the first page
-        articles, total_pages, next_url = self.get_article_list_page(bz, page)
+        try:
+            articles, total_pages, next_url = self.get_article_list_page(bz, page)
+            consecutive_failures = 0
+        except Exception as e:
+            result.error_message = str(e)
+            self.log(f"[API] Failed to fetch page 1 of the list: {str(e)}")
+            return result

         if not articles:
             self.log(f"[API] '{browse_type}' has no pending content")
@@ -396,6 +400,8 @@ class APIBrowser:
         if next_url:
             base_url = next_url
+        elif total_pages > 1:
+            base_url = f"{BASE_URL}/admin/center.aspx?bz={bz}&page=2"

         total_records = int(getattr(self, "last_total_records", 0) or 0)
         last_report_ts = 0.0
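
Note: with this fallback, a missing "next page" link no longer strands the loop on page 1. The URL selection reduces to the small pure function below (a sketch only; BASE_URL is a placeholder and the query-string convention is taken from the hunks above):

    import re
    from typing import Optional

    BASE_URL = "http://example.internal"  # placeholder for the real constant

    def list_url(bz: str, page: int, base_url: Optional[str] = None) -> str:
        # Prefer rewriting the server-supplied "next page" link, which may
        # carry extra query parameters; otherwise build page= by hand.
        if base_url and page > 1:
            return re.sub(r'page=\d+', f'page={page}', base_url)
        if page > 1:
            return f"{BASE_URL}/admin/center.aspx?bz={bz}&page={page}"
        return f"{BASE_URL}/admin/center.aspx?bz={bz}"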
@@ -416,22 +422,51 @@ class APIBrowser:
         report_progress(force=True)

         # Process all pages
-        while True:
+        while page <= total_pages:
             if should_stop_callback and should_stop_callback():
                 self.log("[API] Stop signal received")
                 break

+            # Page 1 was already fetched; later pages are fetched here
+            if page > 1:
+                try:
+                    articles, _, next_url = self.get_article_list_page(bz, page, base_url)
+                    consecutive_failures = 0
+                    if next_url:
+                        base_url = next_url
+                except Exception as e:
+                    skipped_pages += 1
+                    consecutive_failures += 1
+                    self.log(
+                        f"[API] Failed to fetch list page {page}, skipping it (consecutive failures {consecutive_failures}/{max_consecutive_failures}): {str(e)}"
+                    )
+                    if consecutive_failures >= max_consecutive_failures:
+                        raise
+                    page += 1
+                    continue

             for article in articles:
                 if should_stop_callback and should_stop_callback():
                     break
                 title = article['title'][:30]

+                # Fetch attachments (article detail page)
+                try:
+                    attachments = self.get_article_attachments(article['href'])
+                    consecutive_failures = 0
+                except Exception as e:
+                    skipped_items += 1
+                    consecutive_failures += 1
+                    self.log(
+                        f"[API] Failed to fetch article, skipping it (consecutive failures {consecutive_failures}/{max_consecutive_failures}): {title} | {str(e)}"
+                    )
+                    if consecutive_failures >= max_consecutive_failures:
+                        raise
+                    continue

                 total_items += 1
                 report_progress()

-                # Fetch attachments
-                attachments = self.get_article_attachments(article['href'])
                 if attachments:
                     for attach in attachments:
                         if self.mark_read(attach['id'], attach['channel_id']):
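
Note: both skip paths above follow the same circuit-breaker pattern: skip and count isolated failures, reset the streak on any success, and re-raise once failures run consecutively past a threshold (three here), so a dead session aborts loudly instead of silently skipping everything. Condensed into a self-contained helper (names are illustrative, not from this codebase):

    from typing import Callable, Iterable, TypeVar

    T = TypeVar("T")
    R = TypeVar("R")

    def process_with_breaker(
        items: Iterable[T],
        handle: Callable[[T], R],
        max_consecutive_failures: int = 3,
    ) -> tuple[list[R], int]:
        """Skip isolated failures; abort after too many in a row."""
        results: list[R] = []
        skipped = 0
        consecutive = 0
        for item in items:
            try:
                results.append(handle(item))
                consecutive = 0        # any success resets the streak
            except Exception:
                skipped += 1
                consecutive += 1
                if consecutive >= max_consecutive_failures:
                    raise              # likely systemic, not one bad item
        return results, skipped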
@@ -441,22 +476,16 @@ class APIBrowser:
                 time.sleep(0.1)

-            # Next page
             page += 1
-            if page > total_pages:
-                break
-            articles, _, next_url = self.get_article_list_page(bz, page, base_url)
-            if not articles:
-                break
-            if next_url:
-                base_url = next_url
             time.sleep(0.2)

         report_progress(force=True)
-        self.log(f"[API] Browse finished: {total_items} items, {total_attachments} attachments")
+        if skipped_items or skipped_pages:
+            self.log(
+                f"[API] Browse finished: {total_items} items, {total_attachments} attachments (skipped {skipped_items} items, {skipped_pages} list pages)"
+            )
+        else:
+            self.log(f"[API] Browse finished: {total_items} items, {total_attachments} attachments")
         result.success = True
         result.total_items = total_items
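
As a quick check of the breaker sketch above: a handler that fails on every third item completes with three skips (failures are never consecutive), while one that always fails would raise on its third attempt. `flaky` is hypothetical:

    def flaky(n: int) -> int:
        if n % 3 == 0:
            raise TimeoutError(n)
        return n * n

    results, skipped = process_with_breaker(range(1, 10), flaky)
    print(results, skipped)  # [1, 4, 16, 25, 49, 64] 3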