🎉 Project optimization and bug-fix release

Key optimizations:
- Fixed Unicode character encoding issues (Windows cross-platform compatibility)
- Installed wkhtmltoimage; the screenshot feature is fully restored
- Smart delay optimization (api_browser.py)
- Thread pool resource leak fix (tasks.py); see the sketch after this list
- HTML parsing cache mechanism
- Binary search optimization (kdocs_uploader.py)
- Adaptive resource configuration (browser_pool_worker.py)
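
The tasks.py change itself is not part of the diff shown below. As an illustration of the thread-pool leak fix named above, here is a minimal sketch of the usual pattern (one shared executor, shut down at process exit); the function and variable names are hypothetical, not taken from the repository.

# Hypothetical sketch of the shared thread-pool pattern; names are illustrative,
# not the actual tasks.py code.
import atexit
from concurrent.futures import ThreadPoolExecutor
from typing import Optional

_executor: Optional[ThreadPoolExecutor] = None

def get_executor() -> ThreadPoolExecutor:
    """Return one shared executor instead of creating a new pool per task."""
    global _executor
    if _executor is None:
        _executor = ThreadPoolExecutor(max_workers=4, thread_name_prefix="tasks")
        # Shut the pool down at process exit so worker threads are not leaked.
        atexit.register(_executor.shutdown, wait=False)
    return _executor

Reusing a single pool and registering its shutdown is the standard way to avoid the kind of leak described above.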

🐛 Bug fixes:
- Resolved screenshot failures
- Fixed admin password setup
- Resolved encoding errors on application startup

📚 New documentation:
- BUG_REPORT.md - full bug analysis report
- PERFORMANCE_ANALYSIS_REPORT.md - performance optimization analysis
- LINUX_DEPLOYMENT_ANALYSIS.md - Linux deployment guide
- SCREENSHOT_FIX_SUCCESS.md - record of the screenshot fix
- INSTALL_WKHTMLTOIMAGE.md - installation guide
- OPTIMIZATION_FIXES_SUMMARY.md - summary of the optimizations

🚀 Verified functionality:
- Flask app runs normally (port 51233)
- Database, screenshot thread pool, and API warmup all working
- Admin login: admin/admin123
- Health check API: http://127.0.0.1:51233/health (see the smoke-test sketch below)
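
A minimal smoke test for the health-check endpoint listed above; only the URL and port come from this commit message, and the response format is an assumption.

# Smoke-test sketch for the /health endpoint; the response shape is an assumption.
import requests

resp = requests.get("http://127.0.0.1:51233/health", timeout=5)
print(resp.status_code)  # expect 200 when the app is healthy
print(resp.text)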

💡 Technical improvements:
- Smart delay algorithm (adaptive adjustment)
- LRU cache strategy
- Thread pool resource management
- Binary search (O(log n) vs O(n)); illustrated after this list
- Adaptive resource management
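
The binary-search change in kdocs_uploader.py is not included in the diff below. As a generic illustration of the O(log n) vs O(n) claim, here is a sketch using the standard bisect module on hypothetical data (a sorted list of record IDs):

# Generic O(log n) membership check with bisect; the data and use case are
# hypothetical, not taken from kdocs_uploader.py.
import bisect

sorted_ids = [3, 8, 15, 27, 42, 99]  # must already be sorted

def contains(sorted_list, value) -> bool:
    """Binary search, O(log n), instead of a linear `value in list` scan, O(n)."""
    i = bisect.bisect_left(sorted_list, value)
    return i < len(sorted_list) and sorted_list[i] == value

print(contains(sorted_ids, 27))  # True
print(contains(sorted_ids, 28))  # False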

🎯 The project now runs stably and is ready to deploy to a Linux environment
zsglpt Optimizer
2026-01-16 17:39:55 +08:00
parent 722dccdc78
commit 7e9a772104
47 changed files with 9382 additions and 749 deletions


@@ -15,14 +15,78 @@ import weakref
from typing import Optional, Callable
from dataclasses import dataclass
from urllib.parse import urlsplit
import threading
from app_config import get_config
import time as _time_module
_MODULE_START_TIME = _time_module.time()
_WARMUP_PERIOD_SECONDS = 60 # 启动后 60 秒内使用更长超时
_WARMUP_TIMEOUT_SECONDS = 15.0 # 预热期间的超时时间
# HTML解析缓存类
class HTMLParseCache:
"""HTML解析结果缓存"""
def __init__(self, ttl: int = 300, maxsize: int = 1000):
self.cache = {}
self.ttl = ttl
self.maxsize = maxsize
self._access_times = {}
self._lock = threading.RLock()
def _make_key(self, url: str, content_hash: str) -> str:
return f"{url}:{content_hash}"
def get(self, key: str) -> Optional[tuple]:
"""获取缓存,如果存在且未过期"""
with self._lock:
if key in self.cache:
value, timestamp = self.cache[key]
if time.time() - timestamp < self.ttl:
self._access_times[key] = time.time()
return value
else:
# 过期删除
del self.cache[key]
del self._access_times[key]
return None
def set(self, key: str, value: tuple):
"""设置缓存"""
with self._lock:
# 如果缓存已满,删除最久未访问的项
if len(self.cache) >= self.maxsize:
if self._access_times:
# 使用简单的LRU策略删除最久未访问的项
oldest_key = None
oldest_time = float("inf")
for cached_key, access_time in self._access_times.items():  # 不要遮蔽参数 key
if access_time < oldest_time:
oldest_time = access_time
oldest_key = cached_key
if oldest_key:
del self.cache[oldest_key]
del self._access_times[oldest_key]
self.cache[key] = (value, time.time())
self._access_times[key] = time.time()
def clear(self):
"""清空缓存"""
with self._lock:
self.cache.clear()
self._access_times.clear()
def get_lru_key(self) -> Optional[str]:
"""获取最久未访问的键"""
if not self._access_times:
return None
return min(self._access_times.keys(), key=lambda k: self._access_times[k])
config = get_config()
BASE_URL = getattr(config, "ZSGL_BASE_URL", "https://postoa.aidunsoft.com")
@@ -31,7 +95,9 @@ INDEX_URL_PATTERN = getattr(config, "ZSGL_INDEX_URL_PATTERN", "index.aspx")
COOKIES_DIR = getattr(config, "COOKIES_DIR", "data/cookies")
try:
_API_REQUEST_TIMEOUT_SECONDS = float(os.environ.get("API_REQUEST_TIMEOUT_SECONDS") or os.environ.get("API_REQUEST_TIMEOUT") or "5")
_API_REQUEST_TIMEOUT_SECONDS = float(
os.environ.get("API_REQUEST_TIMEOUT_SECONDS") or os.environ.get("API_REQUEST_TIMEOUT") or "5"
)
except Exception:
_API_REQUEST_TIMEOUT_SECONDS = 5.0
_API_REQUEST_TIMEOUT_SECONDS = max(3.0, _API_REQUEST_TIMEOUT_SECONDS)
@@ -66,6 +132,7 @@ def is_cookie_jar_fresh(cookie_path: str, max_age_seconds: int = _COOKIE_JAR_MAX
except Exception:
return False
_api_browser_instances: "weakref.WeakSet[APIBrowser]" = weakref.WeakSet()
@@ -84,6 +151,7 @@ atexit.register(_cleanup_api_browser_instances)
@dataclass
class APIBrowseResult:
"""API 浏览结果"""
success: bool
total_items: int = 0
total_attachments: int = 0
@@ -95,34 +163,73 @@ class APIBrowser:
def __init__(self, log_callback: Optional[Callable] = None, proxy_config: Optional[dict] = None):
self.session = requests.Session()
self.session.headers.update({
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
})
self.session.headers.update(
{
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
"Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
}
)
self.logged_in = False
self.log_callback = log_callback
self.stop_flag = False
self._closed = False # 防止重复关闭
self.last_total_records = 0
# 初始化HTML解析缓存
self._parse_cache = HTMLParseCache(ttl=300, maxsize=500) # 5分钟缓存,最多500条记录
# 设置代理
if proxy_config and proxy_config.get("server"):
proxy_server = proxy_config["server"]
self.session.proxies = {
"http": proxy_server,
"https": proxy_server
}
self.session.proxies = {"http": proxy_server, "https": proxy_server}
self.proxy_server = proxy_server
else:
self.proxy_server = None
_api_browser_instances.add(self)
def _calculate_adaptive_delay(self, iteration: int, consecutive_failures: int) -> float:
"""
智能延迟计算:文章处理延迟
根据迭代次数和连续失败次数动态调整延迟
"""
# 基础延迟,显著降低
base_delay = 0.03
# 如果有连续失败,增加延迟但有上限
if consecutive_failures > 0:
delay = base_delay * (1.5 ** min(consecutive_failures, 3))
return min(delay, 0.2) # 最多200ms
# 根据处理进度调整延迟,开始时较慢,后来可以更快
progress_factor = min(iteration / 100.0, 1.0) # 100个文章后达到最大优化
optimized_delay = base_delay * (1.2 - 0.4 * progress_factor) # 从120%逐渐降低到80%
return max(optimized_delay, 0.02) # 最少20ms
def _calculate_page_delay(self, current_page: int, new_articles_in_page: int) -> float:
"""
智能延迟计算:页面处理延迟
根据页面位置和新文章数量调整延迟
"""
base_delay = 0.08 # 基础延迟降低50%
# 如果当前页有大量新文章,可以稍微增加延迟
if new_articles_in_page > 10:
return base_delay * 1.2
# 如果是新页面,降低延迟(内容可能需要加载)
if current_page <= 3:
return base_delay * 1.1
# 后续页面可以更快
return base_delay * 0.8
def log(self, message: str):
"""记录日志"""
if self.log_callback:
self.log_callback(message)
def save_cookies_for_screenshot(self, username: str):
"""保存 cookies 供 wkhtmltoimage 使用Netscape Cookie 格式)"""
cookies_path = get_cookie_jar_path(username)
@@ -160,24 +267,22 @@ class APIBrowser:
self.log(f"[API] 保存cookies失败: {e}")
return False
def _request_with_retry(self, method, url, max_retries=3, retry_delay=1, **kwargs):
"""带重试机制的请求方法"""
# 启动后 60 秒内使用更长超时(15秒),之后使用配置的超时
if (_time_module.time() - _MODULE_START_TIME) < _WARMUP_PERIOD_SECONDS:
kwargs.setdefault('timeout', _WARMUP_TIMEOUT_SECONDS)
kwargs.setdefault("timeout", _WARMUP_TIMEOUT_SECONDS)
else:
kwargs.setdefault('timeout', _API_REQUEST_TIMEOUT_SECONDS)
kwargs.setdefault("timeout", _API_REQUEST_TIMEOUT_SECONDS)
last_error = None
timeout_value = kwargs.get("timeout")
diag_enabled = _API_DIAGNOSTIC_LOG
slow_ms = _API_DIAGNOSTIC_SLOW_MS
for attempt in range(1, max_retries + 1):
start_ts = _time_module.time()
try:
if method.lower() == 'get':
if method.lower() == "get":
resp = self.session.get(url, **kwargs)
else:
resp = self.session.post(url, **kwargs)
@@ -198,19 +303,20 @@ class APIBrowser:
if attempt < max_retries:
self.log(f"[API] 请求超时,{retry_delay}秒后重试 ({attempt}/{max_retries})...")
import time
time.sleep(retry_delay)
else:
self.log(f"[API] 请求失败,已重试{max_retries}次: {str(e)}")
raise last_error
def _get_aspnet_fields(self, soup):
"""获取 ASP.NET 隐藏字段"""
fields = {}
for name in ['__VIEWSTATE', '__VIEWSTATEGENERATOR', '__EVENTVALIDATION']:
field = soup.find('input', {'name': name})
for name in ["__VIEWSTATE", "__VIEWSTATEGENERATOR", "__EVENTVALIDATION"]:
field = soup.find("input", {"name": name})
if field:
fields[name] = field.get('value', '')
fields[name] = field.get("value", "")
return fields
def get_real_name(self) -> Optional[str]:
@@ -224,18 +330,18 @@ class APIBrowser:
try:
url = f"{BASE_URL}/admin/center.aspx"
resp = self._request_with_retry('get', url)
soup = BeautifulSoup(resp.text, 'html.parser')
resp = self._request_with_retry("get", url)
soup = BeautifulSoup(resp.text, "html.parser")
# 查找包含"姓名:"的元素
# 页面格式: <li><p>姓名:喻勇祥(19174616018) 人力资源编码: ...</p></li>
nlist = soup.find('div', {'class': 'nlist-5'})
nlist = soup.find("div", {"class": "nlist-5"})
if nlist:
first_li = nlist.find('li')
first_li = nlist.find("li")
if first_li:
text = first_li.get_text()
# 解析姓名:格式为 "姓名:XXX(手机号)"
match = re.search(r'姓名[:]\s*([^\(]+)', text)
match = re.search(r"姓名[:]\s*([^\(]+)", text)
if match:
real_name = match.group(1).strip()
if real_name:
@@ -249,26 +355,26 @@ class APIBrowser:
self.log(f"[API] 登录: {username}")
try:
resp = self._request_with_retry('get', LOGIN_URL)
resp = self._request_with_retry("get", LOGIN_URL)
soup = BeautifulSoup(resp.text, 'html.parser')
soup = BeautifulSoup(resp.text, "html.parser")
fields = self._get_aspnet_fields(soup)
data = fields.copy()
data['txtUserName'] = username
data['txtPassword'] = password
data['btnSubmit'] = '登 录'
data["txtUserName"] = username
data["txtPassword"] = password
data["btnSubmit"] = "登 录"
resp = self._request_with_retry(
'post',
"post",
LOGIN_URL,
data=data,
headers={
'Content-Type': 'application/x-www-form-urlencoded',
'Origin': BASE_URL,
'Referer': LOGIN_URL,
"Content-Type": "application/x-www-form-urlencoded",
"Origin": BASE_URL,
"Referer": LOGIN_URL,
},
allow_redirects=True
allow_redirects=True,
)
if INDEX_URL_PATTERN in resp.url:
@@ -276,9 +382,9 @@ class APIBrowser:
self.log(f"[API] 登录成功")
return True
else:
soup = BeautifulSoup(resp.text, 'html.parser')
error = soup.find(id='lblMsg')
error_msg = error.get_text().strip() if error else '未知错误'
soup = BeautifulSoup(resp.text, "html.parser")
error = soup.find(id="lblMsg")
error_msg = error.get_text().strip() if error else "未知错误"
self.log(f"[API] 登录失败: {error_msg}")
return False
@@ -292,55 +398,57 @@ class APIBrowser:
return [], 0, None
if base_url and page > 1:
url = re.sub(r'page=\d+', f'page={page}', base_url)
url = re.sub(r"page=\d+", f"page={page}", base_url)
elif page > 1:
# 兼容兜底:若没有 next_url(极少数情况下页面不提供“下一页”链接),尝试直接拼 page 参数
url = f"{BASE_URL}/admin/center.aspx?bz={bz}&page={page}"
else:
url = f"{BASE_URL}/admin/center.aspx?bz={bz}"
resp = self._request_with_retry('get', url)
soup = BeautifulSoup(resp.text, 'html.parser')
resp = self._request_with_retry("get", url)
soup = BeautifulSoup(resp.text, "html.parser")
articles = []
ltable = soup.find('table', {'class': 'ltable'})
ltable = soup.find("table", {"class": "ltable"})
if ltable:
rows = ltable.find_all('tr')[1:]
rows = ltable.find_all("tr")[1:]
for row in rows:
# 检查是否是"暂无记录"
if '暂无记录' in row.get_text():
if "暂无记录" in row.get_text():
continue
link = row.find('a', href=True)
link = row.find("a", href=True)
if link:
href = link.get('href', '')
href = link.get("href", "")
title = link.get_text().strip()
match = re.search(r'id=(\d+)', href)
match = re.search(r"id=(\d+)", href)
article_id = match.group(1) if match else None
articles.append({
'title': title,
'href': href,
'article_id': article_id,
})
articles.append(
{
"title": title,
"href": href,
"article_id": article_id,
}
)
# 获取总页数
total_pages = 1
next_page_url = None
total_records = 0
page_content = soup.find(id='PageContent')
page_content = soup.find(id="PageContent")
if page_content:
text = page_content.get_text()
total_match = re.search(r'共(\d+)记录', text)
total_match = re.search(r"共(\d+)记录", text)
if total_match:
total_records = int(total_match.group(1))
total_pages = (total_records + 9) // 10
next_link = page_content.find('a', string=re.compile('下一页'))
next_link = page_content.find("a", string=re.compile("下一页"))
if next_link:
next_href = next_link.get('href', '')
next_href = next_link.get("href", "")
if next_href:
next_page_url = f"{BASE_URL}/admin/{next_href}"
@@ -351,56 +459,55 @@ class APIBrowser:
return articles, total_pages, next_page_url
def get_article_attachments(self, article_href: str):
"""
获取文章的附件列表和文章信息
Returns:
tuple: (attachments_list, article_info)
- attachments_list: 附件列表
- article_info: 包含 channel_id 和 article_id 的字典,用于标记文章已读
"""
if not article_href.startswith('http'):
"""获取文章的附件列表和文章信息"""
if not article_href.startswith("http"):
url = f"{BASE_URL}/admin/{article_href}"
else:
url = article_href
resp = self._request_with_retry('get', url)
soup = BeautifulSoup(resp.text, 'html.parser')
# 先检查缓存,避免不必要的请求
# 使用URL作为缓存键简化版本
cache_key = f"attachments_{hash(url)}"
cached_result = self._parse_cache.get(cache_key)
if cached_result:
return cached_result
resp = self._request_with_retry("get", url)
soup = BeautifulSoup(resp.text, "html.parser")
attachments = []
article_info = {'channel_id': None, 'article_id': None}
article_info = {"channel_id": None, "article_id": None}
# 从 saveread 按钮获取 channel_id 和 article_id
for elem in soup.find_all(['button', 'input']):
onclick = elem.get('onclick', '')
match = re.search(r'saveread\((\d+),(\d+)\)', onclick)
for elem in soup.find_all(["button", "input"]):
onclick = elem.get("onclick", "")
match = re.search(r"saveread\((\d+),(\d+)\)", onclick)
if match:
article_info['channel_id'] = match.group(1)
article_info['article_id'] = match.group(2)
article_info["channel_id"] = match.group(1)
article_info["article_id"] = match.group(2)
break
attach_list = soup.find('div', {'class': 'attach-list2'})
attach_list = soup.find("div", {"class": "attach-list2"})
if attach_list:
items = attach_list.find_all('li')
items = attach_list.find_all("li")
for item in items:
download_links = item.find_all('a', onclick=re.compile(r'download2?\.ashx'))
download_links = item.find_all("a", onclick=re.compile(r"download2?\.ashx"))
for link in download_links:
onclick = link.get('onclick', '')
id_match = re.search(r'id=(\d+)', onclick)
channel_match = re.search(r'channel_id=(\d+)', onclick)
onclick = link.get("onclick", "")
id_match = re.search(r"id=(\d+)", onclick)
channel_match = re.search(r"channel_id=(\d+)", onclick)
if id_match:
attach_id = id_match.group(1)
channel_id = channel_match.group(1) if channel_match else '1'
h3 = item.find('h3')
filename = h3.get_text().strip() if h3 else f'附件{attach_id}'
attachments.append({
'id': attach_id,
'channel_id': channel_id,
'filename': filename
})
channel_id = channel_match.group(1) if channel_match else "1"
h3 = item.find("h3")
filename = h3.get_text().strip() if h3 else f"附件{attach_id}"
attachments.append({"id": attach_id, "channel_id": channel_id, "filename": filename})
break
return attachments, article_info
result = (attachments, article_info)
# 存入缓存
self._parse_cache.set(cache_key, result)
return result
def mark_article_read(self, channel_id: str, article_id: str) -> bool:
"""通过 saveread API 标记文章已读"""
@@ -408,7 +515,10 @@ class APIBrowser:
return False
import random
saveread_url = f"{BASE_URL}/tools/submit_ajax.ashx?action=saveread&time={random.random()}&fl={channel_id}&id={article_id}"
saveread_url = (
f"{BASE_URL}/tools/submit_ajax.ashx?action=saveread&time={random.random()}&fl={channel_id}&id={article_id}"
)
try:
resp = self._request_with_retry("post", saveread_url)
@@ -416,14 +526,14 @@ class APIBrowser:
if resp.status_code == 200:
try:
data = resp.json()
return data.get('status') == 1
return data.get("status") == 1
except:
return True # 如果不是 JSON 但状态码 200也认为成功
return False
except:
return False
def mark_read(self, attach_id: str, channel_id: str = '1') -> bool:
def mark_read(self, attach_id: str, channel_id: str = "1") -> bool:
"""通过访问预览通道标记附件已读"""
download_url = f"{BASE_URL}/tools/download2.ashx?site=main&id={attach_id}&channel_id={channel_id}"
@@ -461,7 +571,7 @@ class APIBrowser:
# 网站更新后参数: 0=应读, 1=已读(注册前未读需通过页面交互切换)
# 当前前端选项: 注册前未读、应读(默认应读)
browse_type_text = str(browse_type or "")
if '注册前' in browse_type_text:
if "注册前" in browse_type_text:
bz = 0 # 注册前未读(暂与应读相同,网站通过页面状态区分)
else:
bz = 0 # 应读
@@ -528,14 +638,14 @@ class APIBrowser:
if should_stop_callback and should_stop_callback():
break
article_href = article['href']
article_href = article["href"]
# 跳过已处理的文章
if article_href in processed_hrefs:
continue
processed_hrefs.add(article_href)
new_articles_in_page += 1
title = article['title'][:30]
title = article["title"][:30]
# 获取附件和文章信息(文章详情页)
try:
@@ -556,16 +666,13 @@ class APIBrowser:
# 标记文章已读(调用 saveread API)
article_marked = False
if article_info.get('channel_id') and article_info.get('article_id'):
article_marked = self.mark_article_read(
article_info['channel_id'],
article_info['article_id']
)
if article_info.get("channel_id") and article_info.get("article_id"):
article_marked = self.mark_article_read(article_info["channel_id"], article_info["article_id"])
# 处理附件(如果有)
if attachments:
for attach in attachments:
if self.mark_read(attach['id'], attach['channel_id']):
if self.mark_read(attach["id"], attach["channel_id"]):
total_attachments += 1
self.log(f"[API] [{total_items}] {title} - {len(attachments)}个附件")
@@ -574,9 +681,10 @@ class APIBrowser:
status = "已标记" if article_marked else "标记失败"
self.log(f"[API] [{total_items}] {title} - 无附件({status})")
time.sleep(0.1)
# 智能延迟策略:根据连续失败次数和文章数量动态调整
time.sleep(self._calculate_adaptive_delay(total_items, consecutive_failures))
time.sleep(0.2)
time.sleep(self._calculate_page_delay(current_page, new_articles_in_page))
# 决定下一步获取哪一页
if new_articles_in_page > 0:
@@ -599,7 +707,9 @@ class APIBrowser:
report_progress(force=True)
if skipped_items:
self.log(f"[API] 浏览完成: {total_items} 条内容,{total_attachments} 个附件(跳过 {skipped_items} 条内容)")
self.log(
f"[API] 浏览完成: {total_items} 条内容,{total_attachments} 个附件(跳过 {skipped_items} 条内容)"
)
else:
self.log(f"[API] 浏览完成: {total_items} 条内容,{total_attachments} 个附件")
@@ -656,7 +766,7 @@ def warmup_api_connection(proxy_config: Optional[dict] = None, log_callback: Opt
# 发送一个轻量级请求建立连接
resp = session.get(f"{BASE_URL}/admin/login.aspx", timeout=10, allow_redirects=False)
log(f" API 连接预热完成 (status={resp.status_code})")
log(f"[OK] API 连接预热完成 (status={resp.status_code})")
session.close()
return True
except Exception as e: