#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ API 浏览器 - 用纯 HTTP 请求实现浏览功能 比 Playwright 快 30-60 倍 """ import requests from bs4 import BeautifulSoup import re import time from typing import Optional, Callable from dataclasses import dataclass BASE_URL = "https://postoa.aidunsoft.com" @dataclass class APIBrowseResult: """API 浏览结果""" success: bool total_items: int = 0 total_attachments: int = 0 error_message: str = "" class APIBrowser: """API 浏览器 - 使用纯 HTTP 请求实现浏览""" def __init__(self, log_callback: Optional[Callable] = None, proxy_config: Optional[dict] = None): self.session = requests.Session() self.session.headers.update({ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8', }) self.logged_in = False self.log_callback = log_callback self.stop_flag = False # 设置代理 if proxy_config and proxy_config.get("server"): proxy_server = proxy_config["server"] self.session.proxies = { "http": proxy_server, "https": proxy_server } self.proxy_server = proxy_server else: self.proxy_server = None def log(self, message: str): """记录日志""" if self.log_callback: self.log_callback(message) def save_cookies_for_playwright(self, username: str): """保存cookies供Playwright使用""" import os import json import hashlib cookies_dir = '/app/data/cookies' os.makedirs(cookies_dir, exist_ok=True) # 用用户名的hash作为文件名 filename = hashlib.md5(username.encode()).hexdigest() + '.json' cookies_path = os.path.join(cookies_dir, filename) try: # 获取requests session的cookies cookies_list = [] for cookie in self.session.cookies: cookies_list.append({ 'name': cookie.name, 'value': cookie.value, 'domain': cookie.domain or 'postoa.aidunsoft.com', 'path': cookie.path or '/', }) # Playwright storage_state 格式 storage_state = { 'cookies': cookies_list, 'origins': [] } with open(cookies_path, 'w', encoding='utf-8') as f: json.dump(storage_state, f) self.log(f"[API] Cookies已保存供截图使用") return True except Exception as e: self.log(f"[API] 保存cookies失败: {e}") return False def _request_with_retry(self, method, url, max_retries=3, retry_delay=1, **kwargs): """带重试机制的请求方法""" kwargs.setdefault('timeout', 10) last_error = None for attempt in range(1, max_retries + 1): try: if method.lower() == 'get': resp = self.session.get(url, **kwargs) else: resp = self.session.post(url, **kwargs) return resp except Exception as e: last_error = e if attempt < max_retries: self.log(f"[API] 请求超时,{retry_delay}秒后重试 ({attempt}/{max_retries})...") import time time.sleep(retry_delay) else: self.log(f"[API] 请求失败,已重试{max_retries}次: {str(e)}") raise last_error def _get_aspnet_fields(self, soup): """获取 ASP.NET 隐藏字段""" fields = {} for name in ['__VIEWSTATE', '__VIEWSTATEGENERATOR', '__EVENTVALIDATION']: field = soup.find('input', {'name': name}) if field: fields[name] = field.get('value', '') return fields def login(self, username: str, password: str) -> bool: """登录""" self.log(f"[API] 登录: {username}") try: login_url = f"{BASE_URL}/admin/login.aspx" resp = self._request_with_retry('get', login_url) soup = BeautifulSoup(resp.text, 'html.parser') fields = self._get_aspnet_fields(soup) data = fields.copy() data['txtUserName'] = username data['txtPassword'] = password data['btnSubmit'] = '登 录' resp = self._request_with_retry( 'post', login_url, data=data, headers={ 'Content-Type': 'application/x-www-form-urlencoded', 'Origin': BASE_URL, 'Referer': login_url, }, allow_redirects=True ) if 'index.aspx' in resp.url: self.logged_in = True self.log(f"[API] 登录成功") return True else: soup = BeautifulSoup(resp.text, 'html.parser') error = soup.find(id='lblMsg') error_msg = error.get_text().strip() if error else '未知错误' self.log(f"[API] 登录失败: {error_msg}") return False except Exception as e: self.log(f"[API] 登录异常: {str(e)}") return False def get_article_list_page(self, bz: int = 2, page: int = 1, base_url: str = None): """获取单页文章列表""" if not self.logged_in: return [], 0, None try: if base_url and page > 1: url = re.sub(r'page=\d+', f'page={page}', base_url) else: url = f"{BASE_URL}/admin/center.aspx?bz={bz}" resp = self._request_with_retry('get', url) soup = BeautifulSoup(resp.text, 'html.parser') articles = [] ltable = soup.find('table', {'class': 'ltable'}) if ltable: rows = ltable.find_all('tr')[1:] for row in rows: # 检查是否是"暂无记录" if '暂无记录' in row.get_text(): continue link = row.find('a', href=True) if link: href = link.get('href', '') title = link.get_text().strip() match = re.search(r'id=(\d+)', href) article_id = match.group(1) if match else None articles.append({ 'title': title, 'href': href, 'article_id': article_id, }) # 获取总页数 total_pages = 1 next_page_url = None page_content = soup.find(id='PageContent') if page_content: text = page_content.get_text() total_match = re.search(r'共(\d+)记录', text) if total_match: total_records = int(total_match.group(1)) total_pages = (total_records + 9) // 10 next_link = page_content.find('a', string=re.compile('下一页')) if next_link: next_href = next_link.get('href', '') if next_href: next_page_url = f"{BASE_URL}/admin/{next_href}" return articles, total_pages, next_page_url except Exception as e: self.log(f"[API] 获取列表失败: {str(e)}") return [], 0, None def get_article_attachments(self, article_href: str): """获取文章的附件列表""" try: if not article_href.startswith('http'): url = f"{BASE_URL}/admin/{article_href}" else: url = article_href resp = self._request_with_retry('get', url) soup = BeautifulSoup(resp.text, 'html.parser') attachments = [] attach_list = soup.find('div', {'class': 'attach-list2'}) if attach_list: items = attach_list.find_all('li') for item in items: download_links = item.find_all('a', onclick=re.compile(r'download\.ashx')) for link in download_links: onclick = link.get('onclick', '') id_match = re.search(r'id=(\d+)', onclick) channel_match = re.search(r'channel_id=(\d+)', onclick) if id_match: attach_id = id_match.group(1) channel_id = channel_match.group(1) if channel_match else '1' h3 = item.find('h3') filename = h3.get_text().strip() if h3 else f'附件{attach_id}' attachments.append({ 'id': attach_id, 'channel_id': channel_id, 'filename': filename }) break return attachments except Exception as e: return [] def mark_read(self, attach_id: str, channel_id: str = '1') -> bool: """通过访问下载链接标记已读""" download_url = f"{BASE_URL}/tools/download.ashx?site=main&id={attach_id}&channel_id={channel_id}" try: resp = self._request_with_retry("get", download_url, stream=True) resp.close() return resp.status_code == 200 except: return False def browse_content(self, browse_type: str, should_stop_callback: Optional[Callable] = None) -> APIBrowseResult: """ 浏览内容并标记已读 Args: browse_type: 浏览类型 (应读/注册前未读) should_stop_callback: 检查是否应该停止的回调函数 Returns: 浏览结果 """ result = APIBrowseResult(success=False) if not self.logged_in: result.error_message = "未登录" return result # 根据浏览类型确定 bz 参数 # 网页实际选项: 0=注册前未读, 1=已读, 2=应读 # 前端选项: 注册前未读, 应读, 未读, 已读 if '注册前' in browse_type: bz = 0 # 注册前未读 elif browse_type == '已读': bz = 1 # 已读 else: bz = 2 # 应读、未读 都映射到 bz=2 self.log(f"[API] 开始浏览 '{browse_type}' (bz={bz})...") try: total_items = 0 total_attachments = 0 page = 1 base_url = None # 获取第一页 articles, total_pages, next_url = self.get_article_list_page(bz, page) if not articles: self.log(f"[API] '{browse_type}' 没有待处理内容") result.success = True return result self.log(f"[API] 共 {total_pages} 页,开始处理...") if next_url: base_url = next_url # 处理所有页面 while True: if should_stop_callback and should_stop_callback(): self.log("[API] 收到停止信号") break for article in articles: if should_stop_callback and should_stop_callback(): break title = article['title'][:30] total_items += 1 # 获取附件 attachments = self.get_article_attachments(article['href']) if attachments: for attach in attachments: if self.mark_read(attach['id'], attach['channel_id']): total_attachments += 1 self.log(f"[API] [{total_items}] {title} - {len(attachments)}个附件") time.sleep(0.1) # 下一页 page += 1 if page > total_pages: break articles, _, next_url = self.get_article_list_page(bz, page, base_url) if not articles: break if next_url: base_url = next_url time.sleep(0.2) self.log(f"[API] 浏览完成: {total_items} 条内容,{total_attachments} 个附件") result.success = True result.total_items = total_items result.total_attachments = total_attachments return result except Exception as e: result.error_message = str(e) self.log(f"[API] 浏览出错: {str(e)}") return result def close(self): """关闭会话""" try: self.session.close() except: pass