fix(api): handle timeouts per item so one failure no longer ends the whole run
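The change applies a skip-and-escalate pattern: each page fetch and each article fetch is wrapped in its own try/except, a failure is logged and counted as skipped, and the run only aborts after several consecutive failures. The sketch below illustrates that pattern only; `fetch_page`, `browse_all`, and `MAX_CONSECUTIVE_FAILURES` are illustrative names, not code taken from api_browser.py.

import random

# Illustrative threshold, mirroring max_consecutive_failures = 3 in the diff below.
MAX_CONSECUTIVE_FAILURES = 3

def fetch_page(page):
    # Stand-in for a network call that can time out (placeholder, not the real API).
    if random.random() < 0.2:
        raise TimeoutError(f"page {page} timed out")
    return f"content of page {page}"

def browse_all(pages):
    skipped = 0
    consecutive_failures = 0
    for page in pages:
        try:
            data = fetch_page(page)
            consecutive_failures = 0          # any success resets the streak
        except Exception as exc:
            skipped += 1
            consecutive_failures += 1
            print(f"[skip] page {page}: {exc} "
                  f"({consecutive_failures}/{MAX_CONSECUTIVE_FAILURES} consecutive)")
            if consecutive_failures >= MAX_CONSECUTIVE_FAILURES:
                raise                         # persistent outage: stop the run
            continue
        print(f"[ok] {data}")
    return skipped

if __name__ == "__main__":
    try:
        print("skipped:", browse_all(range(1, 9)))
    except TimeoutError as exc:
        print("aborted after repeated failures:", exc)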
api_browser.py
@@ -233,105 +233,99 @@ class APIBrowser:
         if not self.logged_in:
             return [], 0, None
 
-        try:
         if base_url and page > 1:
             url = re.sub(r'page=\d+', f'page={page}', base_url)
+        elif page > 1:
+            # 兼容兜底:若没有 next_url(极少数情况下页面不提供“下一页”链接),尝试直接拼 page 参数
+            url = f"{BASE_URL}/admin/center.aspx?bz={bz}&page={page}"
         else:
             url = f"{BASE_URL}/admin/center.aspx?bz={bz}"
 
         resp = self._request_with_retry('get', url)
         soup = BeautifulSoup(resp.text, 'html.parser')
         articles = []
 
         ltable = soup.find('table', {'class': 'ltable'})
         if ltable:
             rows = ltable.find_all('tr')[1:]
             for row in rows:
                 # 检查是否是"暂无记录"
                 if '暂无记录' in row.get_text():
                     continue
 
                 link = row.find('a', href=True)
                 if link:
                     href = link.get('href', '')
                     title = link.get_text().strip()
 
                     match = re.search(r'id=(\d+)', href)
                     article_id = match.group(1) if match else None
 
                     articles.append({
                         'title': title,
                         'href': href,
                         'article_id': article_id,
                     })
 
         # 获取总页数
         total_pages = 1
         next_page_url = None
         total_records = 0
 
         page_content = soup.find(id='PageContent')
         if page_content:
             text = page_content.get_text()
             total_match = re.search(r'共(\d+)记录', text)
             if total_match:
                 total_records = int(total_match.group(1))
                 total_pages = (total_records + 9) // 10
 
             next_link = page_content.find('a', string=re.compile('下一页'))
             if next_link:
                 next_href = next_link.get('href', '')
                 if next_href:
                     next_page_url = f"{BASE_URL}/admin/{next_href}"
 
         try:
             self.last_total_records = int(total_records or 0)
         except Exception:
             self.last_total_records = 0
         return articles, total_pages, next_page_url
 
-        except Exception as e:
-            self.log(f"[API] 获取列表失败: {str(e)}")
-            return [], 0, None
-
     def get_article_attachments(self, article_href: str):
         """获取文章的附件列表"""
-        try:
         if not article_href.startswith('http'):
             url = f"{BASE_URL}/admin/{article_href}"
         else:
             url = article_href
 
         resp = self._request_with_retry('get', url)
         soup = BeautifulSoup(resp.text, 'html.parser')
 
         attachments = []
 
         attach_list = soup.find('div', {'class': 'attach-list2'})
         if attach_list:
             items = attach_list.find_all('li')
             for item in items:
                 download_links = item.find_all('a', onclick=re.compile(r'download\.ashx'))
                 for link in download_links:
                     onclick = link.get('onclick', '')
                     id_match = re.search(r'id=(\d+)', onclick)
                     channel_match = re.search(r'channel_id=(\d+)', onclick)
                     if id_match:
                         attach_id = id_match.group(1)
                         channel_id = channel_match.group(1) if channel_match else '1'
                         h3 = item.find('h3')
                         filename = h3.get_text().strip() if h3 else f'附件{attach_id}'
                         attachments.append({
                             'id': attach_id,
                             'channel_id': channel_id,
                             'filename': filename
                         })
                         break
 
         return attachments
 
-        except Exception as e:
-            return []
-
     def mark_read(self, attach_id: str, channel_id: str = '1') -> bool:
         """通过访问下载链接标记已读"""
@@ -383,9 +377,19 @@ class APIBrowser:
         total_attachments = 0
         page = 1
         base_url = None
+        skipped_items = 0
+        skipped_pages = 0
+        consecutive_failures = 0
+        max_consecutive_failures = 3
 
         # 获取第一页
-        articles, total_pages, next_url = self.get_article_list_page(bz, page)
+        try:
+            articles, total_pages, next_url = self.get_article_list_page(bz, page)
+            consecutive_failures = 0
+        except Exception as e:
+            result.error_message = str(e)
+            self.log(f"[API] 获取第1页列表失败: {str(e)}")
+            return result
 
         if not articles:
             self.log(f"[API] '{browse_type}' 没有待处理内容")
@@ -396,6 +400,8 @@ class APIBrowser:
 
         if next_url:
             base_url = next_url
+        elif total_pages > 1:
+            base_url = f"{BASE_URL}/admin/center.aspx?bz={bz}&page=2"
 
         total_records = int(getattr(self, "last_total_records", 0) or 0)
         last_report_ts = 0.0
@@ -416,22 +422,51 @@
         report_progress(force=True)
 
         # 处理所有页面
-        while True:
+        while page <= total_pages:
             if should_stop_callback and should_stop_callback():
                 self.log("[API] 收到停止信号")
                 break
 
+            # page==1 已取过,后续页在这里获取
+            if page > 1:
+                try:
+                    articles, _, next_url = self.get_article_list_page(bz, page, base_url)
+                    consecutive_failures = 0
+                    if next_url:
+                        base_url = next_url
+                except Exception as e:
+                    skipped_pages += 1
+                    consecutive_failures += 1
+                    self.log(
+                        f"[API] 获取第{page}页列表失败,跳过本页(连续失败{consecutive_failures}/{max_consecutive_failures}): {str(e)}"
+                    )
+                    if consecutive_failures >= max_consecutive_failures:
+                        raise
+                    page += 1
+                    continue
+
             for article in articles:
                 if should_stop_callback and should_stop_callback():
                     break
 
                 title = article['title'][:30]
+                # 获取附件(文章详情页)
+                try:
+                    attachments = self.get_article_attachments(article['href'])
+                    consecutive_failures = 0
+                except Exception as e:
+                    skipped_items += 1
+                    consecutive_failures += 1
+                    self.log(
+                        f"[API] 获取文章失败,跳过(连续失败{consecutive_failures}/{max_consecutive_failures}): {title} | {str(e)}"
+                    )
+                    if consecutive_failures >= max_consecutive_failures:
+                        raise
+                    continue
+
                 total_items += 1
                 report_progress()
 
-                # 获取附件
-                attachments = self.get_article_attachments(article['href'])
-
                 if attachments:
                     for attach in attachments:
                         if self.mark_read(attach['id'], attach['channel_id']):
@@ -441,22 +476,16 @@ class APIBrowser:
 
                         time.sleep(0.1)
 
-                # 下一页
                 page += 1
-                if page > total_pages:
-                    break
-
-                articles, _, next_url = self.get_article_list_page(bz, page, base_url)
-                if not articles:
-                    break
-
-                if next_url:
-                    base_url = next_url
-
                 time.sleep(0.2)
 
         report_progress(force=True)
-        self.log(f"[API] 浏览完成: {total_items} 条内容,{total_attachments} 个附件")
+        if skipped_items or skipped_pages:
+            self.log(
+                f"[API] 浏览完成: {total_items} 条内容,{total_attachments} 个附件(跳过 {skipped_items} 条内容,{skipped_pages} 页列表)"
+            )
+        else:
+            self.log(f"[API] 浏览完成: {total_items} 条内容,{total_attachments} 个附件")
 
         result.success = True
         result.total_items = total_items