#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
API browser - browsing implemented with plain HTTP requests.

Roughly 30-60x faster than Playwright.
"""

import requests
from bs4 import BeautifulSoup
import re
import time
import atexit
import weakref
from typing import Optional, Callable
from dataclasses import dataclass
from urllib.parse import urlsplit

from app_config import get_config

config = get_config()

BASE_URL = getattr(config, "ZSGL_BASE_URL", "https://postoa.aidunsoft.com")
LOGIN_URL = getattr(config, "ZSGL_LOGIN_URL", f"{BASE_URL}/admin/login.aspx")
INDEX_URL_PATTERN = getattr(config, "ZSGL_INDEX_URL_PATTERN", "index.aspx")
COOKIES_DIR = getattr(config, "COOKIES_DIR", "data/cookies")

# Fallback domain for cookies that carry no domain of their own.
_cookie_domain_fallback = urlsplit(BASE_URL).hostname or "postoa.aidunsoft.com"

# Weak references, so tracking instances for cleanup never blocks GC.
_api_browser_instances: "weakref.WeakSet[APIBrowser]" = weakref.WeakSet()


def _cleanup_api_browser_instances():
    """Clean up leftover APIBrowser instances at process exit (weak refs, GC-safe)."""
    for inst in list(_api_browser_instances):
        try:
            inst.close()
        except Exception:
            pass


atexit.register(_cleanup_api_browser_instances)


@dataclass
class APIBrowseResult:
    """Result of an API browsing run."""
    success: bool
    total_items: int = 0
    total_attachments: int = 0
    error_message: str = ""


class APIBrowser:
    """API browser - drives the site with plain HTTP requests."""

    def __init__(self, log_callback: Optional[Callable] = None, proxy_config: Optional[dict] = None):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        })
        self.logged_in = False
        self.log_callback = log_callback
        self.stop_flag = False
        self._closed = False  # guards against double close()
        self.last_total_records = 0

        # Set up the proxy, if one was supplied.
        if proxy_config and proxy_config.get("server"):
            proxy_server = proxy_config["server"]
            self.session.proxies = {
                "http": proxy_server,
                "https": proxy_server
            }
            self.proxy_server = proxy_server
        else:
            self.proxy_server = None

        _api_browser_instances.add(self)

    def log(self, message: str):
        """Emit a log message via the callback, if one is registered."""
        if self.log_callback:
            self.log_callback(message)

    def save_cookies_for_playwright(self, username: str):
        """Save this session's cookies for reuse by Playwright."""
        import os
        import json
        import hashlib

        os.makedirs(COOKIES_DIR, exist_ok=True)

        # Security fix: hash the filename with SHA-256 instead of MD5.
        filename = hashlib.sha256(username.encode()).hexdigest()[:32] + '.json'
        cookies_path = os.path.join(COOKIES_DIR, filename)

        try:
            # Collect the cookies held by the requests session.
            cookies_list = []
            for cookie in self.session.cookies:
                cookies_list.append({
                    'name': cookie.name,
                    'value': cookie.value,
                    'domain': cookie.domain or _cookie_domain_fallback,
                    'path': cookie.path or '/',
                })

            # Playwright storage_state format.
            storage_state = {
                'cookies': cookies_list,
                'origins': []
            }

            with open(cookies_path, 'w', encoding='utf-8') as f:
                json.dump(storage_state, f)

            self.log("[API] Cookies saved for screenshot use")
            return True
        except Exception as e:
            self.log(f"[API] Failed to save cookies: {e}")
            return False

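    # How a consumer might load the saved file (a minimal sketch, not part of
    # this module; the chromium/headless choices and the <hash> placeholder in
    # the path are illustrative assumptions):
    #
    #     from playwright.sync_api import sync_playwright
    #
    #     with sync_playwright() as p:
    #         browser = p.chromium.launch(headless=True)
    #         # Seed the context with the cookies written above.
    #         context = browser.new_context(storage_state="data/cookies/<hash>.json")
    #         page = context.new_page()
    #         page.goto(f"{BASE_URL}/admin/index.aspx")
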
    def _request_with_retry(self, method, url, max_retries=3, retry_delay=1, **kwargs):
        """Issue a request, retrying on failure up to max_retries times."""
        kwargs.setdefault('timeout', 5)
        last_error = None

        for attempt in range(1, max_retries + 1):
            try:
                if method.lower() == 'get':
                    resp = self.session.get(url, **kwargs)
                else:
                    resp = self.session.post(url, **kwargs)
                return resp
            except Exception as e:
                last_error = e
                if attempt < max_retries:
                    self.log(f"[API] Request failed, retrying in {retry_delay}s ({attempt}/{max_retries})...")
                    time.sleep(retry_delay)
                else:
                    self.log(f"[API] Request failed after {max_retries} attempts: {str(e)}")

        raise last_error

    def _get_aspnet_fields(self, soup):
        """Collect the hidden ASP.NET WebForms postback fields from a page."""
        fields = {}
        for name in ['__VIEWSTATE', '__VIEWSTATEGENERATOR', '__EVENTVALIDATION']:
            field = soup.find('input', {'name': name})
            if field:
                fields[name] = field.get('value', '')
        return fields

    def get_real_name(self) -> Optional[str]:
        """
        Fetch the user's real name.

        Parses the name out of the center.aspx page.
        Returns the name string, or None on failure.
        """
        if not self.logged_in:
            return None

        try:
            url = f"{BASE_URL}/admin/center.aspx"
            resp = self._request_with_retry('get', url)
            soup = BeautifulSoup(resp.text, 'html.parser')

            # Look for the element containing "姓名:" (name).
            # Page format: <li><p>姓名:XXX(mobile) 人力资源编码: ...</p></li>
            nlist = soup.find('div', {'class': 'nlist-5'})
            if nlist:
                first_li = nlist.find('li')
                if first_li:
                    text = first_li.get_text()
                    # The name appears as "姓名:XXX(mobile)"; capture up to the paren.
                    match = re.search(r'姓名[::]\s*([^\((]+)', text)
                    if match:
                        real_name = match.group(1).strip()
                        if real_name:
                            return real_name
            return None
        except Exception:
            return None

    def login(self, username: str, password: str) -> bool:
        """Log in to the admin site."""
        self.log(f"[API] Logging in: {username}")

        try:
            resp = self._request_with_retry('get', LOGIN_URL)

            soup = BeautifulSoup(resp.text, 'html.parser')
            fields = self._get_aspnet_fields(soup)

            data = fields.copy()
            data['txtUserName'] = username
            data['txtPassword'] = password
            data['btnSubmit'] = '登 录'  # literal button text expected by the server

            resp = self._request_with_retry(
                'post',
                LOGIN_URL,
                data=data,
                headers={
                    'Content-Type': 'application/x-www-form-urlencoded',
                    'Origin': BASE_URL,
                    'Referer': LOGIN_URL,
                },
                allow_redirects=True
            )

            # A successful login redirects to the index page.
            if INDEX_URL_PATTERN in resp.url:
                self.logged_in = True
                self.log("[API] Login succeeded")
                return True
            else:
                soup = BeautifulSoup(resp.text, 'html.parser')
                error = soup.find(id='lblMsg')
                error_msg = error.get_text().strip() if error else 'unknown error'
                self.log(f"[API] Login failed: {error_msg}")
                return False

        except Exception as e:
            self.log(f"[API] Login error: {str(e)}")
            return False

    def get_article_list_page(self, bz: int = 2, page: int = 1, base_url: Optional[str] = None):
        """Fetch one page of the article list; returns (articles, total_pages, next_page_url)."""
        if not self.logged_in:
            return [], 0, None

        if base_url and page > 1:
            url = re.sub(r'page=\d+', f'page={page}', base_url)
        elif page > 1:
            # Compatibility fallback: if there is no next_url (rarely, the page
            # offers no "next page" link), build the page parameter directly.
            url = f"{BASE_URL}/admin/center.aspx?bz={bz}&page={page}"
        else:
            url = f"{BASE_URL}/admin/center.aspx?bz={bz}"

        resp = self._request_with_retry('get', url)
        soup = BeautifulSoup(resp.text, 'html.parser')
        articles = []

        ltable = soup.find('table', {'class': 'ltable'})
        if ltable:
            rows = ltable.find_all('tr')[1:]
            for row in rows:
                # Skip the "no records" (暂无记录) placeholder row.
                if '暂无记录' in row.get_text():
                    continue

                link = row.find('a', href=True)
                if link:
                    href = link.get('href', '')
                    title = link.get_text().strip()

                    match = re.search(r'id=(\d+)', href)
                    article_id = match.group(1) if match else None

                    articles.append({
                        'title': title,
                        'href': href,
                        'article_id': article_id,
                    })

        # Work out the total page count.
        total_pages = 1
        next_page_url = None
        total_records = 0

        page_content = soup.find(id='PageContent')
        if page_content:
            text = page_content.get_text()
            total_match = re.search(r'共(\d+)记录', text)  # "N records in total"
            if total_match:
                total_records = int(total_match.group(1))
                total_pages = (total_records + 9) // 10  # ceiling division, 10 records per page

            next_link = page_content.find('a', string=re.compile('下一页'))  # "next page" link
            if next_link:
                next_href = next_link.get('href', '')
                if next_href:
                    next_page_url = f"{BASE_URL}/admin/{next_href}"

        try:
            self.last_total_records = int(total_records or 0)
        except Exception:
            self.last_total_records = 0
        return articles, total_pages, next_page_url

    def get_article_attachments(self, article_href: str):
        """Fetch the list of attachments on an article page."""
        if not article_href.startswith('http'):
            url = f"{BASE_URL}/admin/{article_href}"
        else:
            url = article_href

        resp = self._request_with_retry('get', url)
        soup = BeautifulSoup(resp.text, 'html.parser')

        attachments = []

        attach_list = soup.find('div', {'class': 'attach-list2'})
        if attach_list:
            items = attach_list.find_all('li')
            for item in items:
                download_links = item.find_all('a', onclick=re.compile(r'download\.ashx'))
                for link in download_links:
                    onclick = link.get('onclick', '')
                    id_match = re.search(r'id=(\d+)', onclick)
                    channel_match = re.search(r'channel_id=(\d+)', onclick)
                    if id_match:
                        attach_id = id_match.group(1)
                        channel_id = channel_match.group(1) if channel_match else '1'
                        h3 = item.find('h3')
                        filename = h3.get_text().strip() if h3 else f'附件{attach_id}'
                        attachments.append({
                            'id': attach_id,
                            'channel_id': channel_id,
                            'filename': filename
                        })
                        break  # one download link per attachment item is enough

        return attachments

    def mark_read(self, attach_id: str, channel_id: str = '1') -> bool:
        """Mark an attachment as read by hitting its download URL."""
        download_url = f"{BASE_URL}/tools/download.ashx?site=main&id={attach_id}&channel_id={channel_id}"

        try:
            resp = self._request_with_retry("get", download_url, stream=True)
            resp.close()  # stream=True: close without downloading the body
            return resp.status_code == 200
        except Exception:
            return False

    def browse_content(
        self,
        browse_type: str,
        should_stop_callback: Optional[Callable] = None,
        progress_callback: Optional[Callable] = None,
    ) -> APIBrowseResult:
        """
        Browse content and mark it as read.

        Args:
            browse_type: browse category ("应读" = required reading /
                "注册前未读" = unread before registration)
            should_stop_callback: callback that returns True when browsing should stop
            progress_callback: optional callback for reporting the live browse count

        Returns:
            The browse result.
        """
        result = APIBrowseResult(success=False)

        if not self.logged_in:
            result.error_message = "not logged in"
            return result

        # Map the browse type onto the site's bz parameter.
        # Actual values used by the page: 0 = unread before registration,
        # 2 = required reading (1 = already read existed historically but is no longer used).
        # Current frontend options: 注册前未读 / 应读 (default: 应读).
        browse_type_text = str(browse_type or "")
        if '注册前' in browse_type_text:
            bz = 0  # unread before registration
        else:
            bz = 2  # required reading

        self.log(f"[API] Start browsing '{browse_type}' (bz={bz})...")

        try:
            total_items = 0
            total_attachments = 0
            page = 1
            base_url = None
            skipped_items = 0
            skipped_pages = 0
            consecutive_failures = 0
            max_consecutive_failures = 3

            # Fetch the first page.
            try:
                articles, total_pages, next_url = self.get_article_list_page(bz, page)
                consecutive_failures = 0
            except Exception as e:
                result.error_message = str(e)
                self.log(f"[API] Failed to fetch page 1 of the list: {str(e)}")
                return result

            if not articles:
                self.log(f"[API] '{browse_type}' has nothing to process")
                result.success = True
                return result

            self.log(f"[API] {total_pages} page(s) in total, processing...")

            if next_url:
                base_url = next_url
            elif total_pages > 1:
                base_url = f"{BASE_URL}/admin/center.aspx?bz={bz}&page=2"

            total_records = int(getattr(self, "last_total_records", 0) or 0)
            last_report_ts = 0.0

            def report_progress(force: bool = False):
                # Throttled progress reporting: at most once per second unless forced.
                nonlocal last_report_ts
                if not progress_callback:
                    return
                now_ts = time.time()
                if not force and now_ts - last_report_ts < 1.0:
                    return
                last_report_ts = now_ts
                try:
                    progress_callback({"total_items": total_records, "browsed_items": total_items})
                except Exception:
                    pass

            report_progress(force=True)

            # Process every page.
            while page <= total_pages:
                if should_stop_callback and should_stop_callback():
                    self.log("[API] Stop signal received")
                    break

                # Page 1 was fetched above; later pages are fetched here.
                if page > 1:
                    try:
                        articles, _, next_url = self.get_article_list_page(bz, page, base_url)
                        consecutive_failures = 0
                        if next_url:
                            base_url = next_url
                    except Exception as e:
                        skipped_pages += 1
                        consecutive_failures += 1
                        self.log(
                            f"[API] Failed to fetch list page {page}, skipping it (consecutive failures {consecutive_failures}/{max_consecutive_failures}): {str(e)}"
                        )
                        if consecutive_failures >= max_consecutive_failures:
                            raise
                        page += 1
                        continue

                for article in articles:
                    if should_stop_callback and should_stop_callback():
                        break

                    title = article['title'][:30]
                    # Fetch the attachments (article detail page).
                    try:
                        attachments = self.get_article_attachments(article['href'])
                        consecutive_failures = 0
                    except Exception as e:
                        skipped_items += 1
                        consecutive_failures += 1
                        self.log(
                            f"[API] Failed to fetch article, skipping (consecutive failures {consecutive_failures}/{max_consecutive_failures}): {title} | {str(e)}"
                        )
                        if consecutive_failures >= max_consecutive_failures:
                            raise
                        continue

                    total_items += 1
                    report_progress()

                    if attachments:
                        for attach in attachments:
                            if self.mark_read(attach['id'], attach['channel_id']):
                                total_attachments += 1

                    self.log(f"[API] [{total_items}] {title} - {len(attachments)} attachment(s)")

                    time.sleep(0.1)

                page += 1
                time.sleep(0.2)

            report_progress(force=True)
            if skipped_items or skipped_pages:
                self.log(
                    f"[API] Browsing done: {total_items} item(s), {total_attachments} attachment(s) (skipped {skipped_items} item(s), {skipped_pages} list page(s))"
                )
            else:
                self.log(f"[API] Browsing done: {total_items} item(s), {total_attachments} attachment(s)")

            result.success = True
            result.total_items = total_items
            result.total_attachments = total_attachments
            return result

        except Exception as e:
            result.error_message = str(e)
            self.log(f"[API] Browsing error: {str(e)}")
            return result

    def close(self):
        """Close the session."""
        if self._closed:
            return
        self._closed = True

        try:
            self.session.close()
        except Exception:
            pass
        finally:
            try:
                _api_browser_instances.discard(self)
            except Exception:
                pass

    def __enter__(self):
        """Context manager support - enter."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager support - exit."""
        self.close()
        return False  # do not suppress exceptions
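

if __name__ == "__main__":
    # A minimal usage sketch. The credentials and the print-based logging below
    # are placeholders (assumptions for illustration), not values from this project.
    with APIBrowser(log_callback=print) as browser:
        if browser.login("your_username", "your_password"):
            result = browser.browse_content("应读")
            print(f"ok={result.success}, items={result.total_items}, "
                  f"attachments={result.total_attachments}")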