feat: 知识管理平台精简版 - PyQt6桌面应用

主要功能:
- 账号管理:添加/编辑/删除账号,测试登录
- 浏览任务:批量浏览应读/选读内容并标记已读
- 截图管理:wkhtmltoimage截图,查看历史
- 金山文档:扫码登录/微信快捷登录,自动上传截图

技术栈:
- PyQt6 GUI框架
- Playwright 浏览器自动化
- SQLite 本地数据存储
- wkhtmltoimage 网页截图

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-01-18 22:16:36 +08:00
commit 83fef6dff2
24 changed files with 6133 additions and 0 deletions

13
core/__init__.py Normal file
View File

@@ -0,0 +1,13 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""核心业务逻辑模块"""
from .api_browser import APIBrowser, APIBrowseResult, get_cookie_jar_path
from .screenshot import take_screenshot, ScreenshotResult
from .kdocs_uploader import KDocsUploader
# Names re-exported as the public API of the ``core`` package.
__all__ = [
'APIBrowser', 'APIBrowseResult', 'get_cookie_jar_path',
'take_screenshot', 'ScreenshotResult',
'KDocsUploader'
]

504
core/api_browser.py Normal file
View File

@@ -0,0 +1,504 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
API浏览器 - 精简版
用纯HTTP请求实现浏览功能,比浏览器自动化快30-60倍
从原项目精简提取,移除了缓存、诊断日志等复杂功能
"""
import os
import re
import time
import hashlib
from typing import Optional, Callable, List, Dict, Any
from dataclasses import dataclass
from urllib.parse import urlsplit
import requests
from bs4 import BeautifulSoup
@dataclass
class APIBrowseResult:
    """Outcome of one API browsing run.

    Attributes:
        success: Whether the run completed without a fatal error.
        total_items: Number of articles that were processed.
        total_attachments: Number of attachments successfully marked as read.
        error_message: Failure description; empty when there is none.
    """

    success: bool
    total_items: int = 0
    total_attachments: int = 0
    error_message: str = ""
def get_cookie_jar_path(username: str) -> str:
    """Return the path of the Netscape-format cookie file used for screenshots.

    The file name is the first 32 hex characters of the SHA-256 digest of
    ``username`` followed by ``.cookies.txt``. The cookie directory
    (``config.COOKIES_DIR``) is created on demand.

    Args:
        username: Account name the cookie file belongs to.

    Returns:
        The cookie file path as a string. The file itself is not created.
    """
    from config import COOKIES_DIR

    # parents=True so a missing parent directory does not make mkdir fail.
    COOKIES_DIR.mkdir(parents=True, exist_ok=True)
    digest = hashlib.sha256(username.encode("utf-8")).hexdigest()
    return str(COOKIES_DIR / f"{digest[:32]}.cookies.txt")
def is_cookie_jar_fresh(cookie_path: str, max_age_seconds: int = 86400) -> bool:
    """Tell whether the cookie file exists and was modified recently enough.

    Args:
        cookie_path: Path of the cookie file; empty or missing paths are stale.
        max_age_seconds: Maximum accepted age in seconds (default 24 hours).
            Falsy or negative values are treated as 0.

    Returns:
        True when the file's modification time is at most ``max_age_seconds``
        old; False otherwise, including on any error.
    """
    if not (cookie_path and os.path.exists(cookie_path)):
        return False
    try:
        elapsed = time.time() - os.path.getmtime(cookie_path)
        age_limit = max(0, int(max_age_seconds or 0))
    except Exception:
        return False
    return elapsed <= age_limit
class APIBrowser:
    """
    Browser replacement that talks to the site with plain HTTP requests.

    Usage:
        with APIBrowser(log_callback=print) as browser:
            if browser.login(username, password):
                result = browser.browse_content("应读")
    """

    def __init__(self, log_callback: Optional[Callable] = None, proxy_config: Optional[dict] = None):
        """Create the HTTP session and read the site URLs from the config.

        Args:
            log_callback: Optional callable receiving each log message.
            proxy_config: Optional dict; when its ``"server"`` entry is truthy
                it is used as both the HTTP and HTTPS proxy.
        """
        from config import get_config

        session = requests.Session()
        session.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
        })
        self.session = session

        # Runtime state
        self.log_callback = log_callback
        self.logged_in = False
        self.stop_flag = False
        self._closed = False
        self.last_total_records = 0
        self._username = ""

        # Site endpoints come from the application config
        site = get_config().zsgl
        self.base_url = site.base_url
        self.login_url = site.login_url
        self.index_url_pattern = site.index_url_pattern

        # Optional proxy
        proxy_server = proxy_config.get("server") if proxy_config else None
        if proxy_server:
            session.proxies = {"http": proxy_server, "https": proxy_server}
            self.proxy_server = proxy_server
        else:
            self.proxy_server = None
def log(self, message: str):
    """Pass ``message`` to the log callback; do nothing when none is set."""
    callback = self.log_callback
    if not callback:
        return
    callback(message)
def _request_with_retry(self, method: str, url: str, max_retries: int = 3,
                        retry_delay: float = 1, **kwargs) -> "requests.Response":
    """Send a GET or POST through the shared session, retrying on failure.

    Args:
        method: ``"get"`` (case-insensitive) issues a GET; any other value
            issues a POST.
        url: Target URL.
        max_retries: Total number of attempts. Values below 1 are treated
            as 1 so that one request is always made.
        retry_delay: Seconds to sleep between attempts.
        **kwargs: Forwarded to ``session.get`` / ``session.post``;
            ``timeout`` defaults to 10 seconds.

    Returns:
        The response of the first attempt that does not raise.

    Raises:
        Exception: The error raised by the final attempt.
    """
    kwargs.setdefault("timeout", 10.0)
    # Previously max_retries <= 0 skipped the loop and executed ``raise None``
    # (a TypeError); always make at least one attempt.
    attempts = max(1, int(max_retries))
    send = self.session.get if method.lower() == "get" else self.session.post
    for attempt in range(1, attempts + 1):
        try:
            return send(url, **kwargs)
        except Exception as e:
            if attempt >= attempts:
                self.log(f"[API] 请求失败,已重试{attempts}次: {str(e)}")
                raise
            self.log(f"[API] 请求超时,{retry_delay}秒后重试 ({attempt}/{attempts})...")
            time.sleep(retry_delay)
def _get_aspnet_fields(self, soup: BeautifulSoup) -> Dict[str, str]:
    """Collect the ASP.NET hidden form fields found in ``soup``.

    Looks up the ``__VIEWSTATE``, ``__VIEWSTATEGENERATOR`` and
    ``__EVENTVALIDATION`` inputs; fields that are absent are omitted and a
    missing ``value`` attribute yields an empty string.
    """
    wanted = ("__VIEWSTATE", "__VIEWSTATEGENERATOR", "__EVENTVALIDATION")
    located = ((key, soup.find("input", {"name": key})) for key in wanted)
    return {key: tag.get("value", "") for key, tag in located if tag}
def login(self, username: str, password: str) -> bool:
    """Log in by submitting the ASP.NET login form.

    Fetches the login page, copies its hidden ASP.NET fields, posts the
    credentials and treats the login as successful when the final URL
    contains ``self.index_url_pattern``.

    Args:
        username: Account name.
        password: Account password.

    Returns:
        True on success. False on rejected credentials or on any exception
        (both are logged); ``self.logged_in`` is False in those cases.
    """
    self.log(f"[API] 登录: {username}")
    self._username = username
    # Reset first so a failed re-login cannot leave a stale "logged in" flag.
    self.logged_in = False
    try:
        resp = self._request_with_retry("get", self.login_url)
        soup = BeautifulSoup(resp.text, "html.parser")
        data = dict(self._get_aspnet_fields(soup))
        data["txtUserName"] = username
        data["txtPassword"] = password
        data["btnSubmit"] = "登 录"
        resp = self._request_with_retry(
            "post",
            self.login_url,
            data=data,
            headers={
                "Content-Type": "application/x-www-form-urlencoded",
                "Origin": self.base_url,
                "Referer": self.login_url,
            },
            allow_redirects=True,
        )
        if self.index_url_pattern in resp.url:
            self.logged_in = True
            self.log("[API] 登录成功")
            return True

        # Still on the login page: report the message shown in #lblMsg.
        soup = BeautifulSoup(resp.text, "html.parser")
        error = soup.find(id="lblMsg")
        error_msg = error.get_text().strip() if error else "未知错误"
        self.log(f"[API] 登录失败: {error_msg}")
        return False
    except Exception as e:
        self.log(f"[API] 登录异常: {str(e)}")
        return False
def get_real_name(self) -> Optional[str]:
    """Read the user's real name from ``admin/center.aspx``.

    The name is taken from the first ``<li>`` inside ``div.nlist-5``: the
    text after ``姓名`` and a colon, up to an opening parenthesis. Both the
    ASCII and the full-width forms of the colon and parenthesis are accepted.

    Returns:
        The stripped name, or None when not logged in, when the page does
        not contain the expected markup, or on any error.
    """
    if not self.logged_in:
        return None
    try:
        url = f"{self.base_url}/admin/center.aspx"
        resp = self._request_with_retry("get", url)
        soup = BeautifulSoup(resp.text, "html.parser")
        nlist = soup.find("div", {"class": "nlist-5"})
        first_li = nlist.find("li") if nlist else None
        if not first_li:
            return None
        # Chinese pages commonly use the full-width ":" and "(".
        match = re.search(r"姓名[::]\s*([^\((]+)", first_li.get_text())
        return match.group(1).strip() if match else None
    except Exception:
        return None
def save_cookies_for_screenshot(self, username: str) -> bool:
    """Dump the session cookies to the user's Netscape-format cookie file.

    Each cookie becomes one tab-separated line: domain, include-subdomains
    flag, path, secure flag, expiry, name, value.

    Args:
        username: Account whose cookie file (see ``get_cookie_jar_path``)
            is written.

    Returns:
        True when the file was written, False on any error (which is logged).
    """
    cookies_path = get_cookie_jar_path(username)
    try:
        fallback_domain = urlsplit(self.base_url).hostname or "postoa.aidunsoft.com"

        def as_flag(value) -> str:
            return "TRUE" if value else "FALSE"

        rows = ["# Netscape HTTP Cookie File", "# Generated by zsglpt-lite"]
        for cookie in self.session.cookies:
            domain = cookie.domain or fallback_domain
            columns = (
                domain,
                as_flag(domain.startswith(".")),  # leading dot => subdomains included
                cookie.path or "/",
                as_flag(getattr(cookie, "secure", False)),
                str(int(getattr(cookie, "expires", 0) or 0)),
                cookie.name,
                cookie.value,
            )
            rows.append("\t".join(columns))

        with open(cookies_path, "w", encoding="utf-8") as f:
            f.write("\n".join(rows) + "\n")
        self.log("[API] Cookies已保存供截图使用")
        return True
    except Exception as e:
        self.log(f"[API] 保存cookies失败: {e}")
        return False
def get_article_list_page(self, bz: int = 0, page: int = 1) -> tuple:
"""Fetch one page of the article list from ``admin/center.aspx``.

Args:
    bz: Value of the ``bz`` query parameter.
    page: Page number; the ``page`` parameter is only sent when it is > 1.

Returns:
    A 3-tuple ``(articles, total_pages, None)``. ``articles`` is a list of
    dicts with the keys ``title``, ``href`` and ``article_id`` (None when
    the href contains no ``id=<digits>``). ``([], 0, None)`` is returned
    when not logged in.
"""
if not self.logged_in:
return [], 0, None
if page > 1:
url = f"{self.base_url}/admin/center.aspx?bz={bz}&page={page}"
else:
url = f"{self.base_url}/admin/center.aspx?bz={bz}"
resp = self._request_with_retry("get", url)
soup = BeautifulSoup(resp.text, "html.parser")
articles = []
ltable = soup.find("table", {"class": "ltable"})
if ltable:
# Skip the first <tr> (presumably the header row — confirm against the page).
rows = ltable.find_all("tr")[1:]
for row in rows:
# Rows containing the "no records" placeholder text are ignored.
if "暂无记录" in row.get_text():
continue
link = row.find("a", href=True)
if link:
href = link.get("href", "")
title = link.get_text().strip()
match = re.search(r"id=(\d+)", href)
article_id = match.group(1) if match else None
articles.append({
"title": title,
"href": href,
"article_id": article_id,
})
# Work out the total number of pages
total_pages = 1
total_records = 0
page_content = soup.find(id="PageContent")
if page_content:
text = page_content.get_text()
total_match = re.search(r"共(\d+)记录", text)
if total_match:
total_records = int(total_match.group(1))
# Ceiling division by 10, i.e. the code assumes 10 records per page.
total_pages = (total_records + 9) // 10
# NOTE(review): indentation is lost in this view, so it cannot be confirmed
# whether this assignment runs on every call or only when a count was found.
self.last_total_records = total_records
return articles, total_pages, None
def get_article_attachments(self, article_href: str) -> tuple:
"""Fetch an article page and extract its attachments and read-marking ids.

Args:
    article_href: Absolute URL, or a path relative to ``{base_url}/admin/``.

Returns:
    ``(attachments, article_info)``. ``attachments`` is a list of dicts with
    the keys ``id``, ``channel_id`` and ``filename``. ``article_info`` has
    the keys ``channel_id`` and ``article_id``, both None when no
    ``saveread(...)`` handler is found.
"""
if not article_href.startswith("http"):
url = f"{self.base_url}/admin/{article_href}"
else:
url = article_href
resp = self._request_with_retry("get", url)
soup = BeautifulSoup(resp.text, "html.parser")
attachments = []
article_info = {"channel_id": None, "article_id": None}
# Read channel_id and article_id from the first saveread(...) onclick handler
for elem in soup.find_all(["button", "input"]):
onclick = elem.get("onclick", "")
match = re.search(r"saveread\((\d+),(\d+)\)", onclick)
if match:
article_info["channel_id"] = match.group(1)
article_info["article_id"] = match.group(2)
break
attach_list = soup.find("div", {"class": "attach-list2"})
if attach_list:
items = attach_list.find_all("li")
for item in items:
# Links whose onclick references download.ashx or download2.ashx
download_links = item.find_all("a", onclick=re.compile(r"download2?\.ashx"))
for link in download_links:
onclick = link.get("onclick", "")
id_match = re.search(r"id=(\d+)", onclick)
channel_match = re.search(r"channel_id=(\d+)", onclick)
if id_match:
attach_id = id_match.group(1)
# channel_id falls back to "1" when the handler does not carry one
channel_id = channel_match.group(1) if channel_match else "1"
h3 = item.find("h3")
filename = h3.get_text().strip() if h3 else f"附件{attach_id}"
attachments.append({
"id": attach_id,
"channel_id": channel_id,
"filename": filename
})
# NOTE(review): indentation is lost in this view; presumably this break
# limits each <li> to one recorded attachment — confirm its nesting.
break
return attachments, article_info
def mark_article_read(self, channel_id: str, article_id: str) -> bool:
    """Mark an article as read through the ``saveread`` AJAX endpoint.

    Args:
        channel_id: Sent as the ``fl`` query parameter.
        article_id: Sent as the ``id`` query parameter.

    Returns:
        False when either id is empty, the request fails or the status code
        is not 200. For a 200 response: ``status == 1`` from the JSON body,
        or True when the body cannot be interpreted as a JSON object.
    """
    if not channel_id or not article_id:
        return False
    import random

    # The random ``time`` value makes every URL unique (presumably a cache buster).
    saveread_url = (
        f"{self.base_url}/tools/submit_ajax.ashx?action=saveread"
        f"&time={random.random()}&fl={channel_id}&id={article_id}"
    )
    # ``except Exception`` instead of bare ``except`` so that
    # KeyboardInterrupt / SystemExit are no longer swallowed.
    try:
        resp = self._request_with_retry("post", saveread_url)
        if resp.status_code != 200:
            return False
        try:
            data = resp.json()
            return data.get("status") == 1
        except Exception:
            # A 200 response with an unparseable body still counts as success.
            return True
    except Exception:
        return False
def mark_attachment_read(self, attach_id: str, channel_id: str = "1") -> bool:
    """Mark an attachment as read by requesting its ``download2.ashx`` URL.

    The response is opened in streaming mode and closed immediately, so the
    body is not read by this method.

    Args:
        attach_id: Attachment id.
        channel_id: Channel id (default ``"1"``).

    Returns:
        True when the server answered with status 200, False otherwise or on
        any request error.
    """
    download_url = f"{self.base_url}/tools/download2.ashx?site=main&id={attach_id}&channel_id={channel_id}"
    try:
        resp = self._request_with_retry("get", download_url, stream=True)
        resp.close()
        return resp.status_code == 200
    except Exception:
        # Narrowed from a bare ``except`` so KeyboardInterrupt / SystemExit
        # propagate instead of being reported as "not read".
        return False
def browse_content(
self,
browse_type: str,
should_stop_callback: Optional[Callable] = None,
progress_callback: Optional[Callable] = None,
) -> APIBrowseResult:
"""
Browse the pending articles and mark them and their attachments as read.

Args:
    browse_type: Label of the list being browsed. In this method it is only
        interpolated into log messages; the request always uses ``bz=0``.
    should_stop_callback: Optional zero-argument callable; a truthy return
        value stops the run.
    progress_callback: Optional callable that receives
        ``{"total_items": int, "browsed_items": int}`` once before
        processing starts and again after each processed article.

Returns:
    An ``APIBrowseResult``. ``success`` stays False with ``error_message``
    set when not logged in or when an exception escapes the main loop.
"""
result = APIBrowseResult(success=False)
if not self.logged_in:
result.error_message = "未登录"
return result
# Pick the bz parameter (per the original note: after a site update, bz=0 is the must-read list)
bz = 0
self.log(f"[API] 开始浏览 '{browse_type}' (bz={bz})...")
try:
total_items = 0
total_attachments = 0
# Fetch the first page
articles, total_pages, _ = self.get_article_list_page(bz, 1)
if not articles:
self.log(f"[API] '{browse_type}' 没有待处理内容")
result.success = True
return result
total_records = self.last_total_records
self.log(f"[API] 共 {total_records} 条记录,开始处理...")
# Report the initial progress
if progress_callback:
progress_callback({"total_items": total_records, "browsed_items": 0})
# hrefs already handled, so an article is never processed twice
processed_hrefs = set()
current_page = 1
# Upper bound on list fetches so the loop always terminates
max_iterations = total_records + 20
for iteration in range(max_iterations):
if should_stop_callback and should_stop_callback():
self.log("[API] 收到停止信号")
break
if not articles:
break
new_articles_in_page = 0
for article in articles:
if should_stop_callback and should_stop_callback():
break
article_href = article["href"]
if article_href in processed_hrefs:
continue
processed_hrefs.add(article_href)
new_articles_in_page += 1
# Titles are truncated to 30 characters for logging
title = article["title"][:30]
# Fetch the attachments and the article ids
try:
attachments, article_info = self.get_article_attachments(article_href)
except Exception as e:
self.log(f"[API] 获取文章失败: {title} | {str(e)}")
continue
total_items += 1
# Mark the article itself as read
article_marked = False
if article_info.get("channel_id") and article_info.get("article_id"):
article_marked = self.mark_article_read(
article_info["channel_id"],
article_info["article_id"]
)
# Handle the attachments
if attachments:
for attach in attachments:
if self.mark_attachment_read(attach["id"], attach["channel_id"]):
total_attachments += 1
self.log(f"[API] [{total_items}] {title} - {len(attachments)}个附件")
else:
status = "已标记" if article_marked else "标记失败"
self.log(f"[API] [{total_items}] {title} - 无附件({status})")
# Report progress
if progress_callback:
progress_callback({"total_items": total_records, "browsed_items": total_items})
# Short pause so requests are not fired too quickly
time.sleep(0.05)
# Decide which page to fetch next
# NOTE(review): presumably items marked as read drop out of the list, which
# is why page 1 is fetched again whenever this page produced new work —
# confirm against the server's behaviour.
if new_articles_in_page > 0:
current_page = 1
else:
current_page += 1
if current_page > total_pages:
break
# Fetch the next page
try:
articles, new_total_pages, _ = self.get_article_list_page(bz, current_page)
if new_total_pages > 0:
total_pages = new_total_pages
except Exception as e:
self.log(f"[API] 获取第{current_page}页列表失败: {str(e)}")
break
self.log(f"[API] 浏览完成: {total_items} 条内容,{total_attachments} 个附件")
result.success = True
result.total_items = total_items
result.total_attachments = total_attachments
return result
except Exception as e:
result.error_message = str(e)
self.log(f"[API] 浏览出错: {str(e)}")
return result
def close(self):
    """Close the HTTP session; calling it again is a no-op."""
    if self._closed:
        return
    self._closed = True
    try:
        self.session.close()
    except Exception:
        # Best-effort cleanup: a failing close must not break the caller.
        # Narrowed from a bare ``except`` so KeyboardInterrupt / SystemExit
        # are no longer swallowed.
        pass
def __enter__(self):
"""Enter the ``with`` block and return this instance."""
return self
def __exit__(self, exc_type, exc_val, exc_tb):
"""Close the browser on leaving the ``with`` block."""
self.close()
# Returning False lets any exception raised inside the block propagate.
return False

823
core/kdocs_uploader.py Normal file
View File

@@ -0,0 +1,823 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
金山文档上传模块 - 精简版
使用Playwright自动化上传截图到金山文档表格
移除了队列、并发控制,改为单任务顺序执行
"""
import base64
import os
import re
import time
from io import BytesIO
from typing import Any, Dict, Optional, Callable
from urllib.parse import urlparse
try:
from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError
except ImportError:
sync_playwright = None
class PlaywrightTimeoutError(Exception):
pass
class KDocsUploader:
    """Uploads screenshots into a KDocs (Kingsoft Docs) spreadsheet."""

    def __init__(self, log_callback: Optional[Callable] = None):
        """Initialise an idle uploader; nothing is launched here.

        Args:
            log_callback: Optional callable receiving each log message.
        """
        self._log_callback = log_callback
        # Playwright handles, all unset until they are needed
        self._playwright = None
        self._browser = None
        self._context = None
        self._page = None
        # Session state
        self._doc_url: Optional[str] = None
        self._last_error: Optional[str] = None
        self._logged_in = False
def log(self, msg: str):
    """Pass ``msg`` to the log callback; do nothing when none is set."""
    callback = self._log_callback
    if not callback:
        return
    callback(msg)
def _ensure_playwright(self, use_storage_state: bool = True) -> bool:
"""Make sure Playwright, a browser, a context and a page are available.

Each of the four handles is only created when it is currently missing.

Args:
    use_storage_state: When True and the saved login-state file exists,
        a newly created context is initialised from it.

Returns:
    True when a usable page exists. False (with ``self._last_error`` set)
    when Playwright is not installed or start-up fails; in the failure case
    the browser resources are cleaned up.
"""
if sync_playwright is None:
self._last_error = "playwright 未安装"
return False
try:
from config import KDOCS_LOGIN_STATE_FILE
if self._playwright is None:
self._playwright = sync_playwright().start()
if self._browser is None:
# Debug default: headed mode, so the browser's behaviour can be watched.
# For production, set KDOCS_HEADLESS to "true".
# Any value other than "false" (case-insensitive) enables headless mode.
headless = os.environ.get("KDOCS_HEADLESS", "false").lower() != "false"
# Prefer the system-installed Chrome (needed for WeChat quick login):
# channel='chrome' uses system Chrome instead of Playwright's bundled Chromium.
chrome_args = [
"--disable-blink-features=AutomationControlled", # hide the automation fingerprint
"--disable-features=DialMediaRouteProvider", # suppress the local-network discovery prompt
"--allow-running-insecure-content",
]
try:
self._browser = self._playwright.chromium.launch(
headless=headless,
channel='chrome', # use system Chrome
args=chrome_args
)
self.log("[KDocs] 使用系统Chrome浏览器")
except Exception as e:
# Fall back to the bundled Chromium when system Chrome is unavailable
self.log(f"[KDocs] 系统Chrome不可用({e})使用Chromium")
self._browser = self._playwright.chromium.launch(headless=headless, args=chrome_args)
if self._context is None:
storage_state = str(KDOCS_LOGIN_STATE_FILE)
# Options shared by every new context
context_options = {
"permissions": ["clipboard-read", "clipboard-write"], # clipboard access
"ignore_https_errors": True,
}
if use_storage_state and os.path.exists(storage_state):
context_options["storage_state"] = storage_state
self._context = self._browser.new_context(**context_options)
# Grant clipboard permissions to the WPS account origin. (The original
# comment describes this as local-network access for WeChat quick-login
# detection, but the call only grants clipboard permissions.)
try:
self._context.grant_permissions(
["clipboard-read", "clipboard-write"],
origin="https://account.wps.cn"
)
except Exception:
pass
if self._page is None or self._page.is_closed():
self._page = self._context.new_page()
# 60-second default timeout for page operations
self._page.set_default_timeout(60000)
return True
except Exception as e:
self._last_error = f"浏览器启动失败: {e}"
self._cleanup_browser()
return False
def _cleanup_browser(self):
    """Shut down the page, context, browser and Playwright, in that order.

    Each handle is closed with ``close()`` when it has one, otherwise with
    ``stop()``; errors are ignored and the attribute is reset to None.
    """
    for attr in ('_page', '_context', '_browser', '_playwright'):
        handle = getattr(self, attr, None)
        if not handle:
            continue
        try:
            shutdown = next(
                (name for name in ('close', 'stop') if hasattr(handle, name)),
                None,
            )
            if shutdown is not None:
                getattr(handle, shutdown)()
        except Exception:
            pass
        setattr(self, attr, None)
def _open_document(self, doc_url: str) -> bool:
    """Navigate the page to the KDocs document.

    Args:
        doc_url: URL of the document; also remembered in ``self._doc_url``.

    Returns:
        True once navigation succeeded, False (with ``self._last_error``
        set) when anything raised.
    """
    try:
        self._doc_url = doc_url
        self._ensure_clipboard_permissions(doc_url)
        self._page.goto(doc_url, wait_until="domcontentloaded", timeout=30000)
        # Give the page time to finish loading, including the login button
        time.sleep(3)
    except Exception as exc:
        self._last_error = f"打开文档失败: {exc}"
        return False
    return True
def _ensure_clipboard_permissions(self, doc_url: str):
    """Grant clipboard read/write permission to the document's origin.

    Does nothing when there is no browser context, when ``doc_url`` is empty
    or lacks a scheme or host; any error is ignored.
    """
    if not (self._context and doc_url):
        return
    try:
        parts = urlparse(doc_url)
        if parts.scheme and parts.netloc:
            self._context.grant_permissions(
                ["clipboard-read", "clipboard-write"],
                origin=f"{parts.scheme}://{parts.netloc}",
            )
    except Exception:
        pass
def _is_login_url(self, url: str) -> bool:
    """Tell whether ``url`` looks like a login page.

    A URL counts as a login page when, case-insensitively, it contains
    ``account.wps.cn`` or ``passport``, or contains ``login`` without also
    containing ``kdocs.cn``.
    """
    if not url:
        return False
    lowered = url.lower()
    on_account_site = "account.wps.cn" in lowered or "passport" in lowered
    generic_login = "login" in lowered and "kdocs.cn" not in lowered
    return on_account_site or generic_login
def _page_has_login_gate(self, page) -> bool:
"""Tell whether ``page`` is currently blocked by a login requirement.

Returns:
    False when the page URL is a document URL (after trying to dismiss the
    join-editing dialog) or when no login indicator is found. True when the
    URL is a login URL, a login button is visible, or a visible element that
    looks like a QR code (80-400 px wide) is present.
"""
url = getattr(page, "url", "") or ""
# A document URL means the login already succeeded
if "kdocs.cn/l/" in url or "www.kdocs.cn/l/" in url:
# An invitation dialog may still be showing; try to click it away first
try:
join_btn = page.get_by_role("button", name="登录并加入编辑")
if join_btn.count() > 0 and join_btn.first.is_visible(timeout=500):
self.log("[KDocs] 点击加入编辑按钮")
join_btn.first.click()
time.sleep(1)
except Exception:
pass
# Already on the document page: treated as logged in
return False
# Check whether we are on a login page
if self._is_login_url(url):
self.log(f"[KDocs] 检测到登录页面URL: {url}")
return True
# Only look for the login-page buttons (not the document page's invitation dialog)
login_buttons = ["立即登录", "去登录"]
for text in login_buttons:
try:
btn = page.get_by_role("button", name=text)
if btn.count() > 0 and btn.first.is_visible(timeout=500):
self.log(f"[KDocs] 检测到登录按钮: {text}")
return True
except Exception:
pass
# A visible QR-code element means the page is still waiting for a scan
try:
qr_selectors = ["canvas", "img[class*='qr']", "div[class*='qrcode']"]
for selector in qr_selectors:
qr = page.locator(selector)
if qr.count() > 0:
# Inspect at most the first three matches per selector
for i in range(min(qr.count(), 3)):
el = qr.nth(i)
try:
if el.is_visible(timeout=200):
box = el.bounding_box()
# Only elements 80-400 px wide are considered QR codes
if box and 80 <= box.get("width", 0) <= 400:
self.log(f"[KDocs] 检测到二维码元素: {selector}")
return True
except Exception:
pass
except Exception:
pass
return False
def _is_logged_in(self) -> bool:
    """Return True when an open page exists and shows no login gate."""
    page = self._page
    if page and not page.is_closed():
        return not self._page_has_login_gate(page)
    return False
def _save_login_state(self):
    """Persist the browser context's storage state to the login-state file.

    The parent directory is created when needed. Failures are logged and
    never raised.
    """
    try:
        from config import KDOCS_LOGIN_STATE_FILE

        state_path = str(KDOCS_LOGIN_STATE_FILE)
        KDOCS_LOGIN_STATE_FILE.parent.mkdir(parents=True, exist_ok=True)
        self._context.storage_state(path=state_path)
    except Exception as exc:
        self.log(f"[KDocs] 保存登录状态失败: {exc}")
    else:
        self.log("[KDocs] 登录状态已保存")
def _ensure_login_dialog(self, use_quick_login: bool = False):
"""Click through the login flow until the QR-code login view is shown.

Runs up to 8 rounds; a round that performs a click starts the next round,
and a round without any click ends the loop. Afterwards the WeChat
scan-login entry is clicked, first by button role and then by text. The
method returns early as soon as the page is on a document URL.

Args:
    use_quick_login: Whether to try the "WeChat quick login" entry while on
        account.wps.cn.
"""
agree_names = ["同意", "同意并继续", "我同意", "确定", "确认"]
# Work through the login flow round by round
max_clicks = 8
for round_num in range(max_clicks):
clicked = False
current_url = self._page.url
# Check whether the document page has been reached (login succeeded).
# Wait for the page to settle so a transient redirect is not mistaken for success.
if "kdocs.cn/l/" in current_url or "www.kdocs.cn/l/" in current_url:
time.sleep(1) # let the page settle
stable_url = self._page.url
if "kdocs.cn/l/" in stable_url and "account.wps.cn" not in stable_url:
self.log("[KDocs] 已到达文档页面,登录成功")
return
# 1. Accept the privacy agreement first, if one is shown
for name in agree_names:
try:
btn = self._page.get_by_role("button", name=name)
if btn.count() > 0 and btn.first.is_visible(timeout=300):
self.log(f"[KDocs] 点击同意按钮: {name}")
btn.first.click()
time.sleep(1)
clicked = True
break
except Exception:
pass
if clicked:
continue
# 2. With quick login enabled and on the login page (account.wps.cn), try "WeChat quick login"
if use_quick_login and "account.wps.cn" in current_url:
try:
quick_login = self._page.get_by_text("微信快捷登录", exact=False)
if quick_login.count() > 0 and quick_login.first.is_visible(timeout=500):
self.log("[KDocs] 点击微信快捷登录")
quick_login.first.click()
time.sleep(3) # wait for the quick login to be processed
# Check whether the login succeeded
if "kdocs.cn/l/" in self._page.url:
self.log("[KDocs] 微信快捷登录成功")
return
# NOTE(review): indentation is lost in this view; the nesting of the two
# statements below relative to the visibility check cannot be confirmed.
clicked = True
continue
except Exception:
pass
# 3. Click "log in now" to reach the login page
try:
btn = self._page.get_by_role("button", name="立即登录")
if btn.count() > 0 and btn.first.is_visible(timeout=500):
self.log("[KDocs] 点击立即登录")
btn.first.click()
time.sleep(2)
clicked = True
continue
except Exception:
pass
# 4. Click "log in and join editing" (invitation dialog on the document page)
try:
btn = self._page.get_by_role("button", name="登录并加入编辑")
if btn.count() > 0 and btn.first.is_visible(timeout=500):
self.log("[KDocs] 点击登录并加入编辑")
btn.first.click()
time.sleep(1.5)
clicked = True
continue
except Exception:
pass
# Nothing was clicked in this round: leave the loop
if not clicked:
self.log("[KDocs] 未找到更多可点击的按钮")
break
# Finally make sure the WeChat scan login is selected (switch to QR-code mode)
wechat_names = ["微信登录", "微信扫码登录", "扫码登录", "微信扫码"]
for name in wechat_names:
try:
btn = self._page.get_by_role("button", name=name)
if btn.is_visible(timeout=1000):
self.log(f"[KDocs] 点击微信登录: {name}")
btn.click()
time.sleep(1)
return
except Exception:
pass
# Fall back to locating the WeChat login entry by its text
for name in wechat_names:
try:
el = self._page.get_by_text(name, exact=False).first
if el.is_visible(timeout=500):
self.log(f"[KDocs] 点击微信登录文本: {name}")
el.click()
time.sleep(1)
return
except Exception:
pass
self.log("[KDocs] 未找到登录按钮,可能页面已在登录状态或需要手动操作")
def _capture_qr_image(self) -> Optional[bytes]:
    """Capture a screenshot of the login QR code.

    Every selector is tried on the main page first; after that the first
    five selectors are tried on each frame other than the main frame.

    Returns:
        The screenshot bytes of the first matching element, or None.
    """
    # Candidate selectors for the QR-code element, most specific first
    selectors = [
        "canvas",
        "img[src*='qr']",
        "img[class*='qr']",
        "img[class*='code']",
        "div[class*='qr'] img",
        "div[class*='qrcode'] img",
        "div[class*='scan'] img",
        ".qrcode img",
        ".qr-code img",
        "img",  # last resort: any image
    ]

    # Main page first
    for selector in selectors:
        shot = self._try_capture_qr_with_selector(self._page, selector)
        if shot:
            return shot

    # Then every frame except the main one
    try:
        for frame in self._page.frames:
            if frame == self._page.main_frame:
                continue
            for selector in selectors[:5]:  # only the first few selectors
                shot = self._try_capture_qr_with_selector(frame, selector)
                if shot:
                    return shot
    except Exception:
        pass
    return None
def _try_capture_qr_with_selector(self, page_or_frame, selector: str) -> Optional[bytes]:
    """Screenshot the first QR-code-like element matching ``selector``.

    At most the first 10 matches are inspected. An element qualifies when it
    is visible, 80-400 px in both dimensions, roughly square (width and
    height differ by less than 60 px) and its screenshot is larger than
    500 bytes.

    Returns:
        The screenshot bytes, or None when nothing qualifies or on error.
    """
    try:
        matches = page_or_frame.locator(selector)
        limit = min(matches.count(), 10)
        for index in range(limit):
            candidate = matches.nth(index)
            try:
                if not candidate.is_visible(timeout=300):
                    continue
                box = candidate.bounding_box()
                if not box:
                    continue
                width = box.get("width", 0)
                height = box.get("height", 0)
                # QR codes are usually square and of moderate size
                plausible = (
                    80 <= width <= 400
                    and 80 <= height <= 400
                    and abs(width - height) < 60
                )
                if not plausible:
                    continue
                image = candidate.screenshot()
                if image and len(image) > 500:
                    return image
            except Exception:
                continue
    except Exception:
        pass
    return None
def request_qr(self, force: bool = False) -> Dict[str, Any]:
"""
Open the configured document and, when login is required, capture the QR code.

Args:
    force: Force a fresh login: the saved login-state file is deleted and
        the stored state is not loaded into the browser context.

Returns:
    A dict with these keys (``error`` only on failure; ``logged_in`` and
    ``qr_image`` only on success):
        "success": bool
        "logged_in": bool   # whether the session is already logged in
        "qr_image": str     # base64-encoded QR-code image, "" when logged in
        "error": str        # error message
"""
from config import get_config, KDOCS_LOGIN_STATE_FILE
config = get_config()
doc_url = config.kdocs.doc_url.strip()
if not doc_url:
return {"success": False, "error": "未配置金山文档链接"}
if force:
# Remove the saved login state
try:
if KDOCS_LOGIN_STATE_FILE.exists():
KDOCS_LOGIN_STATE_FILE.unlink()
except Exception:
pass
# NOTE(review): indentation is lost in this view; whether this cleanup is
# part of the ``if force:`` branch cannot be confirmed from here.
self._cleanup_browser()
if not self._ensure_playwright(use_storage_state=not force):
return {"success": False, "error": self._last_error or "浏览器不可用"}
if not self._open_document(doc_url):
return {"success": False, "error": self._last_error or "打开文档失败"}
# Check whether we are already logged in
self.log(f"[KDocs] 当前页面URL: {self._page.url}")
if not force and self._is_logged_in():
self._logged_in = True
self._save_login_state()
return {"success": True, "logged_in": True, "qr_image": ""}
# Login required: obtain the QR code
self.log("[KDocs] 需要登录,尝试打开登录对话框...")
self._ensure_login_dialog()
time.sleep(2) # wait for the login dialog to load
self.log("[KDocs] 尝试捕获二维码...")
qr_image = None
for i in range(15): # up to 15 attempts, one second apart
qr_image = self._capture_qr_image()
if qr_image and len(qr_image) > 1024:
self.log(f"[KDocs] 二维码捕获成功,大小: {len(qr_image)} bytes")
break
self.log(f"[KDocs] 第{i+1}次尝试捕获二维码...")
time.sleep(1)
# NOTE(review): only emptiness is checked here, so a capture of 1024 bytes or
# less from the final attempt is still returned as a success.
if not qr_image:
# The capture failed; the page may not be showing a QR code
self.log("[KDocs] 二维码捕获失败,当前页面可能没有显示二维码")
return {"success": False, "error": "二维码获取失败,请检查网络或手动打开金山文档链接确认"}
return {
"success": True,
"logged_in": False,
"qr_image": base64.b64encode(qr_image).decode("ascii"),
}
def check_login_status(self) -> Dict[str, Any]:
"""Check the login state of the already open page (the page is not reloaded).

Tries to click a login-confirmation control in the page and its frames,
then decides from the current URL, or from ``_is_logged_in()``, whether the
login completed. A successful login is saved to the login-state file.

Returns:
    ``{"success": True, "logged_in": bool}`` normally, or
    ``{"success": False, "logged_in": False, "error": str}`` when no page
    is open or an exception occurred.
"""
# No page, or a closed one, means the login flow has not started yet
if not self._page or self._page.is_closed():
return {"success": False, "logged_in": False, "error": "页面未打开"}
try:
clicked_confirm = False
# Look for the confirmation button in the main page and in every frame
frames_to_check = [self._page] + list(self._page.frames)
for frame in frames_to_check:
if clicked_confirm:
break
# Try the confirm-login button (after the WeChat scan the PC side needs one more click)
confirm_names = ["确认登录", "确定登录", "登录", "确定", "确认", "同意并登录"]
for name in confirm_names:
try:
confirm_btn = frame.get_by_role("button", name=name)
if confirm_btn.count() > 0 and confirm_btn.first.is_visible(timeout=200):
self.log(f"[KDocs] 找到确认按钮: {name}")
confirm_btn.first.click()
clicked_confirm = True
time.sleep(3)
break
except Exception:
pass
# Not found by button role: try an exact text match
if not clicked_confirm:
for name in confirm_names:
try:
el = frame.get_by_text(name, exact=True)
if el.count() > 0 and el.first.is_visible(timeout=200):
self.log(f"[KDocs] 找到确认文本: {name}")
el.first.click()
clicked_confirm = True
time.sleep(3)
break
except Exception:
pass
# Last resort: CSS selectors
if not clicked_confirm:
try:
# Selectors the confirm button on the WPS login page might match
selectors = [
"button.ant-btn-primary",
"button[type='primary']",
".confirm-btn",
".login-confirm",
".btn-primary",
".wps-btn-primary",
"a.confirm",
"div.confirm",
"[class*='confirm']",
"[class*='login-btn']"
]
for selector in selectors:
btns = frame.locator(selector)
if btns.count() > 0:
# Inspect at most the first three matches per selector
for i in range(min(btns.count(), 3)):
btn = btns.nth(i)
try:
if btn.is_visible(timeout=100):
btn_text = btn.inner_text() or ""
# Only click elements whose text mentions confirm / login / OK
if any(kw in btn_text for kw in ["确认", "登录", "确定"]):
self.log(f"[KDocs] 找到按钮(CSS): {btn_text}")
btn.click()
clicked_confirm = True
time.sleep(3)
break
except Exception:
pass
if clicked_confirm:
break
except Exception:
pass
# After a confirm click, wait for the page to redirect by itself; do not reload
if clicked_confirm:
self.log("[KDocs] 已点击确认,等待页面跳转...")
time.sleep(3) # wait for the automatic redirect
# Check whether the current URL is already the document page
current_url = self._page.url
self.log(f"[KDocs] 当前URL: {current_url}")
# Decide directly from the URL whether we are logged in
if "kdocs.cn/l/" in current_url and "account.wps.cn" not in current_url:
# On the document page: login succeeded
logged_in = True
# Click the "join editing" button if it is present
try:
join_btn = self._page.get_by_role("button", name="登录并加入编辑")
if join_btn.count() > 0 and join_btn.first.is_visible(timeout=500):
self.log("[KDocs] 点击加入编辑")
join_btn.first.click()
time.sleep(1)
except Exception:
pass
else:
# Still on the login page or some other page
logged_in = self._is_logged_in()
self._logged_in = logged_in
if logged_in:
self._save_login_state()
self.log("[KDocs] 登录状态检测:已登录")
return {"success": True, "logged_in": logged_in}
except Exception as e:
return {"success": False, "logged_in": False, "error": str(e)}
def _navigate_to_cell(self, cell_address: str):
    """Jump to ``cell_address`` by typing it into the sheet's name box.

    The ``input.edit-box`` element is tried first; when that fails, the
    first text input under ``#root`` is used instead. Errors from the
    fallback propagate to the caller.
    """
    def jump_via(selector: str) -> None:
        box = self._page.locator(selector).first
        box.click()
        box.fill(cell_address)
        box.press("Enter")

    try:
        jump_via("input.edit-box")
    except Exception:
        jump_via('#root input[type="text"]')
    time.sleep(0.3)
def _get_current_cell_address(self) -> str:
    """Read the selected cell's address from the name box.

    Returns:
        The upper-cased address when the box holds letters followed by
        digits (e.g. ``C12``); an empty string otherwise or on any error.
    """
    try:
        raw = self._page.locator("input.edit-box").first.input_value()
        if raw:
            address = raw.upper()
            if re.match(r"^[A-Z]+\d+$", address):
                return address
    except Exception:
        pass
    return ""
def _search_and_get_row(self, search_text: str, expected_col: str = None,
                        row_start: int = 0, row_end: int = 0) -> int:
    """Use the sheet's find dialog to locate ``search_text`` and return its row.

    Args:
        search_text: Text typed into the find box.
        expected_col: When given, the hit must be in this column letter
            (compared case-insensitively).
        row_start: When > 0, hits above this row are rejected.
        row_end: When > 0, hits below this row are rejected.

    Returns:
        The row number of the cell selected after the search, or -1 when no
        address can be read or the hit violates the column or row limits.
    """
    page = self._page

    # Open the find dialog
    page.keyboard.press("Control+f")
    time.sleep(0.3)

    # Type the search text
    try:
        find_box = page.get_by_role("textbox").nth(3)
        if find_box.is_visible(timeout=500):
            find_box.fill(search_text)
    except Exception:
        pass
    time.sleep(0.2)

    # Trigger the search: button first, Enter as fallback
    try:
        page.get_by_role("button", name="查找").first.click()
    except Exception:
        page.keyboard.press("Enter")
    time.sleep(0.3)

    # Close the dialog and read where the selection landed
    page.keyboard.press("Escape")
    time.sleep(0.3)
    address = self._get_current_cell_address()
    if not address:
        return -1

    row_match = re.search(r"(\d+)$", address)
    if not row_match:
        return -1
    row_num = int(row_match.group(1))
    col_letter = "".join(filter(str.isalpha, address)).upper()

    # Column check
    if expected_col and col_letter != expected_col.upper():
        return -1
    # Row-range check
    too_high = row_start > 0 and row_num < row_start
    too_low = row_end > 0 and row_num > row_end
    if too_high or too_low:
        return -1
    return row_num
def _upload_image_to_cell(self, row_num: int, image_path: str, image_col: str) -> bool:
    """Insert a local image as a cell image at ``{image_col}{row_num}``.

    The target cell is selected and cleared, then the UI path
    Insert -> Image -> Cell image -> Local is followed and ``image_path`` is
    handed to the file chooser.

    Returns:
        True on success; False (with ``self._last_error`` set) when the
        insert sequence raises.
    """
    cell_address = f"{image_col}{row_num}"
    self._navigate_to_cell(cell_address)
    time.sleep(0.3)

    # Clear whatever the cell currently holds (best effort)
    try:
        for key, pause in (("Escape", 0.2), ("Delete", 0.3)):
            self._page.keyboard.press(key)
            time.sleep(pause)
    except Exception:
        pass

    # Insert -> Image -> Cell image
    try:
        menu_path = (
            ("button", "插入", 0.3),
            ("button", "图片", 0.3),
            ("option", "单元格图片", 0.2),
        )
        for role, label, pause in menu_path:
            self._page.get_by_role(role, name=label).click()
            time.sleep(pause)

        local_option = self._page.get_by_role("option", name="本地")
        with self._page.expect_file_chooser() as fc_info:
            local_option.click()
        fc_info.value.set_files(image_path)
        time.sleep(2)
        self.log(f"[KDocs] 图片已上传到 {cell_address}")
        return True
    except Exception as e:
        self._last_error = f"上传图片失败: {e}"
        return False
def upload_image(
    self,
    image_path: str,
    unit: str,
    name: str,
) -> Dict[str, Any]:
    """
    Upload a screenshot into the configured KDocs spreadsheet.

    Args:
        image_path: Path of the image file to upload.
        unit: County/district name; must be non-empty.
        name: Person's name; searched in the sheet to find the target row.

    Returns:
        ``{"success": True}`` on success, otherwise
        ``{"success": False, "error": str}``.
    """
    from config import get_config

    def failure(message: str) -> Dict[str, Any]:
        return {"success": False, "error": message}

    kdocs_config = get_config().kdocs

    # Preconditions, checked in order
    if not kdocs_config.enabled:
        return failure("金山文档上传未启用")
    doc_url = kdocs_config.doc_url.strip()
    if not doc_url:
        return failure("未配置金山文档链接")
    if not (unit and name):
        return failure("缺少县区或姓名")
    if not (image_path and os.path.exists(image_path)):
        return failure("图片文件不存在")
    if not self._ensure_playwright():
        return failure(self._last_error or "浏览器不可用")
    if not self._open_document(doc_url):
        return failure(self._last_error or "打开文档失败")
    if not self._is_logged_in():
        return failure("未登录,请先扫码登录")

    try:
        # Switch to the configured worksheet (best effort)
        if kdocs_config.sheet_name:
            try:
                tab = self._page.locator("[role='tab']").filter(has_text=kdocs_config.sheet_name)
                if tab.count() > 0:
                    tab.first.click()
                    time.sleep(0.5)
            except Exception:
                pass

        # Find the person's row by searching for the name
        self.log(f"[KDocs] 搜索人员: {name}")
        row_num = self._search_and_get_row(
            name,
            expected_col=kdocs_config.name_column,
            row_start=kdocs_config.row_start,
            row_end=kdocs_config.row_end,
        )
        if row_num < 0:
            return failure(f"未找到人员: {name}")
        self.log(f"[KDocs] 找到人员在第 {row_num}")

        # Upload the image into that row
        uploaded = self._upload_image_to_cell(row_num, image_path, kdocs_config.image_column)
        if not uploaded:
            return failure(self._last_error or "上传失败")
        return {"success": True}
    except Exception as e:
        return failure(str(e))
def clear_login(self):
    """Forget the login: delete the saved state file and close the browser.

    Errors while deleting the file are ignored.
    """
    from config import KDOCS_LOGIN_STATE_FILE

    state_file = KDOCS_LOGIN_STATE_FILE
    try:
        if state_file.exists():
            state_file.unlink()
    except Exception:
        pass

    self._logged_in = False
    self._cleanup_browser()
def close(self):
"""Close the uploader by releasing its browser resources."""
self._cleanup_browser()
def __enter__(self):
"""Enter the ``with`` block and return this instance."""
return self
def __exit__(self, exc_type, exc_val, exc_tb):
"""Close the uploader on leaving the ``with`` block."""
self.close()
# Returning False lets any exception raised inside the block propagate.
return False
# Module-wide uploader instance, created on first use
_uploader: Optional[KDocsUploader] = None


def get_kdocs_uploader() -> KDocsUploader:
    """Return the shared ``KDocsUploader``, creating it on the first call.

    The instance is created without a log callback.
    """
    global _uploader
    if _uploader is not None:
        return _uploader
    _uploader = KDocsUploader()
    return _uploader

324
core/screenshot.py Normal file
View File

@@ -0,0 +1,324 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
截图模块 - 精简版
使用wkhtmltoimage进行网页截图
移除了线程池、复杂重试逻辑,保持简单
"""
import os
import shutil
import subprocess
from datetime import datetime
from typing import Optional, Callable, List, Tuple
from dataclasses import dataclass
from .api_browser import APIBrowser, get_cookie_jar_path, is_cookie_jar_fresh
@dataclass
class ScreenshotResult:
    """Outcome of one screenshot attempt.

    Attributes:
        success: Whether the screenshot was taken.
        filename: Name of the image file; empty by default.
        filepath: Path of the image file; empty by default.
        error_message: Failure description; empty by default.
    """

    success: bool
    filename: str = ""
    filepath: str = ""
    error_message: str = ""
def _resolve_wkhtmltoimage_path() -> Optional[str]:
    """Locate the ``wkhtmltoimage`` executable.

    Search order: the path from the config (when it exists on disk), the
    system ``PATH``, then the default Windows install locations.

    Returns:
        The first path found, or None when the tool cannot be located.
    """
    from config import get_config

    # 1. Explicitly configured path
    configured = get_config().screenshot.wkhtmltoimage_path
    if configured and os.path.exists(configured):
        return configured

    # 2. PATH lookup
    on_path = shutil.which("wkhtmltoimage")
    if on_path:
        return on_path

    # 3. Default Windows install locations
    candidates = (
        r"C:\Program Files\wkhtmltopdf\bin\wkhtmltoimage.exe",
        r"C:\Program Files (x86)\wkhtmltopdf\bin\wkhtmltoimage.exe",
        os.path.expandvars(r"%ProgramFiles%\wkhtmltopdf\bin\wkhtmltoimage.exe"),
        os.path.expandvars(r"%ProgramFiles(x86)%\wkhtmltopdf\bin\wkhtmltoimage.exe"),
    )
    return next((path for path in candidates if os.path.exists(path)), None)
def _read_cookie_pairs(cookies_path: str) -> List[Tuple[str, str]]:
    """Read ``(name, value)`` pairs from a Netscape-format cookie file.

    Each cookie line has at least seven tab-separated fields; the sixth is
    the cookie name and the seventh its value. Blank lines, comment lines
    and lines with fewer than seven fields are skipped, as are entries with
    an empty name.

    Lines prefixed with ``#HttpOnly_`` are NOT comments: that prefix is how
    Netscape-style cookie jars mark HttpOnly cookies, so the prefix is
    stripped and the rest of the line is parsed like any other entry.

    Args:
        cookies_path: Path of the cookie file. Empty or non-existent paths
            yield an empty list.

    Returns:
        The pairs in file order, or an empty list if the file cannot be read.
    """
    if not cookies_path or not os.path.exists(cookies_path):
        return []

    http_only_prefix = "#HttpOnly_"
    pairs: List[Tuple[str, str]] = []
    try:
        with open(cookies_path, "r", encoding="utf-8", errors="ignore") as f:
            for raw_line in f:
                line = raw_line.strip()
                if line.startswith(http_only_prefix):
                    # HttpOnly cookie entry, not a comment.
                    line = line[len(http_only_prefix):]
                elif not line or line.startswith("#"):
                    continue
                fields = line.split("\t")
                if len(fields) < 7:
                    continue
                name = fields[5].strip()
                value = fields[6].strip()
                if name:
                    pairs.append((name, value))
    except (OSError, ValueError):
        # Best effort: an unreadable file is treated as "no cookies".
        return []
    return pairs
def _select_cookie_pairs(pairs: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
    """Pick the cookies worth passing on.

    If any of the session/auth cookies (``ASP.NET_SessionId``,
    ``.ASPXAUTH``) is present with a non-empty value, only those are
    returned. Otherwise every pair whose name and value are both non-empty
    and pure ASCII is returned. Input order is preserved.
    """
    session_names = {"ASP.NET_SessionId", ".ASPXAUTH"}

    session_cookies = []
    for name, value in pairs:
        if name in session_names and value:
            session_cookies.append((name, value))
    if session_cookies:
        return session_cookies

    ascii_cookies = []
    for name, value in pairs:
        if not (name and value):
            continue
        if name.isascii() and value.isascii():
            ascii_cookies.append((name, value))
    return ascii_cookies
def take_screenshot_wkhtmltoimage(
    url: str,
    output_path: str,
    cookies_path: Optional[str] = None,
    proxy_server: Optional[str] = None,
    run_script: Optional[str] = None,
    window_status: Optional[str] = None,
    log_callback: Optional[Callable] = None,
) -> bool:
    """Render ``url`` to an image file by running wkhtmltoimage.

    Width, height, JPEG quality, JavaScript delay and the process timeout
    are read from the screenshot section of the application configuration.
    The output format is JPEG when ``output_path`` ends in ``.jpg`` or
    ``.jpeg`` and PNG otherwise.

    Args:
        url: Page to capture.
        output_path: Destination image file.
        cookies_path: Optional Netscape-format cookie file. The selected
            cookies are passed individually via ``--cookie``; if none can
            be selected the file is passed as ``--cookie-jar`` instead.
        proxy_server: Optional proxy passed via ``--proxy``.
        run_script: Optional JavaScript passed via ``--run-script``.
        window_status: Optional ``window.status`` value to wait for.
        log_callback: Optional callable receiving diagnostic messages.

    Returns:
        ``True`` when wkhtmltoimage exits with status 0, ``False`` when the
        executable is missing, exits non-zero, times out or cannot be run.
    """
    from config import get_config

    def log(message: str) -> None:
        if log_callback:
            log_callback(message)

    screenshot_config = get_config().screenshot

    wkhtmltoimage_path = _resolve_wkhtmltoimage_path()
    if not wkhtmltoimage_path:
        log("wkhtmltoimage 未安装或不在 PATH 中")
        return False

    ext = os.path.splitext(output_path)[1].lower()
    image_format = "jpg" if ext in (".jpg", ".jpeg") else "png"

    cmd = [
        wkhtmltoimage_path,
        "--format", image_format,
        "--width", str(screenshot_config.width),
        "--disable-smart-width",
        "--javascript-delay", str(screenshot_config.js_delay_ms),
        "--load-error-handling", "ignore",
        "--enable-local-file-access",
        "--encoding", "utf-8",
    ]

    # Present a desktop-browser User-Agent on the page and its sub-requests.
    ua = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    cmd.extend(["--custom-header", "User-Agent", ua, "--custom-header-propagation"])

    # Quality only applies to JPEG output.
    if image_format == "jpg":
        cmd.extend(["--quality", str(screenshot_config.quality)])

    # A non-positive height means "do not constrain the height".
    if screenshot_config.height > 0:
        cmd.extend(["--height", str(screenshot_config.height)])

    if run_script:
        cmd.extend(["--run-script", run_script])
    if window_status:
        cmd.extend(["--window-status", window_status])

    if cookies_path:
        cookie_pairs = _select_cookie_pairs(_read_cookie_pairs(cookies_path))
        if cookie_pairs:
            for name, value in cookie_pairs:
                cmd.extend(["--cookie", name, value])
        else:
            # Nothing usable could be extracted; hand over the whole jar.
            cmd.extend(["--cookie-jar", cookies_path])

    if proxy_server:
        cmd.extend(["--proxy", proxy_server])

    cmd.extend([url, output_path])

    try:
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            # The captured output is only used for logging. Without this,
            # a byte that the locale codec cannot decode raises
            # UnicodeDecodeError and a finished screenshot is reported as
            # a failure by the generic handler below.
            errors="replace",
            timeout=screenshot_config.timeout_seconds,
        )
    except subprocess.TimeoutExpired:
        log("wkhtmltoimage 截图超时")
        return False
    except Exception as e:
        log(f"wkhtmltoimage 截图异常: {e}")
        return False

    if result.returncode != 0:
        err_msg = (result.stderr or result.stdout or "").strip()
        log(f"wkhtmltoimage 截图失败: {err_msg[:200]}")
        return False
    return True
def _sanitize_filename_part(text: str, fallback: str = "account") -> str:
    """Make ``text`` safe to embed in a file name on Windows and POSIX."""
    unsafe = '\\/:*?"<>|'
    cleaned = "".join(
        "_" if (ch in unsafe or ord(ch) < 32) else ch for ch in str(text)
    ).strip(" .")  # Windows rejects names ending in a space or dot
    return cleaned or fallback


def _build_frame_navigation_script(target_url: str) -> str:
    """Build the JS that loads ``target_url`` in the index page's main frame.

    The script sets ``window.status`` to ``'ready'`` once the sidebar
    navigation has links and the ``mainframe`` iframe has finished loading,
    or immediately when the page has no ``mainframe`` element.
    """
    return (
        "(function(){"
        "function done(){window.status='ready';}"
        "function ensureNav(){try{if(typeof loadMenuTree==='function'){loadMenuTree(true);}}catch(e){}}"
        "function expandMenu(){"
        "try{var body=document.body;if(body&&body.classList.contains('lay-mini')){body.classList.remove('lay-mini');}}catch(e){}"
        "try{if(typeof mainPageResize==='function'){mainPageResize();}}catch(e){}"
        "}"
        "function navReady(){"
        "try{var nav=document.getElementById('sidebar-nav');return nav && nav.querySelectorAll('a').length>0;}catch(e){return false;}"
        "}"
        "function frameReady(){"
        "try{var f=document.getElementById('mainframe');return f && f.contentDocument && f.contentDocument.readyState==='complete';}catch(e){return false;}"
        "}"
        "function check(){"
        "if(navReady() && frameReady()){done();return;}"
        "setTimeout(check,300);"
        "}"
        "var f=document.getElementById('mainframe');"
        "ensureNav();"
        "expandMenu();"
        "if(!f){done();return;}"
        f"f.src='{target_url}';"
        "f.onload=function(){ensureNav();expandMenu();setTimeout(check,300);};"
        "setTimeout(check,5000);"
        "})();"
    )


def take_screenshot(
    username: str,
    password: str,
    browse_type: str = "应读",
    remark: str = "",
    log_callback: Optional[Callable] = None,
    proxy_config: Optional[dict] = None,
) -> ScreenshotResult:
    """Run the complete screenshot flow for one account.

    Steps: make sure a fresh cookie file exists (logging in through
    ``APIBrowser`` when it does not), capture the index page with the
    target page loaded in its main frame, and fall back to capturing the
    target page directly if that fails. A result file of 1000 bytes or
    less is treated as a failure and removed.

    Args:
        username: Account user name; also used in the file name when
            ``remark`` is empty.
        password: Account password, used only when a login is required.
        browse_type: Label of the content type; used in log output and the
            file name.
        remark: Optional account remark, preferred over ``username`` for
            the file name.
        log_callback: Optional callable receiving progress messages.
        proxy_config: Optional proxy settings; its ``"server"`` entry is
            passed on to wkhtmltoimage.

    Returns:
        A ``ScreenshotResult``; on success ``filename`` and ``filepath``
        are set, otherwise ``error_message`` is.
    """
    from urllib.parse import urlsplit

    from config import get_config, SCREENSHOTS_DIR

    config = get_config()
    result = ScreenshotResult(success=False)

    def log(msg: str):
        if log_callback:
            log_callback(msg)

    # Create the screenshot directory, including missing parents.
    SCREENSHOTS_DIR.mkdir(parents=True, exist_ok=True)

    # Obtain cookies, logging in again when the stored ones are stale.
    cookie_path = get_cookie_jar_path(username)
    proxy_server = proxy_config.get("server") if proxy_config else None

    if not is_cookie_jar_fresh(cookie_path):
        log("正在登录获取Cookie...")
        with APIBrowser(log_callback=log, proxy_config=proxy_config) as browser:
            if not browser.login(username, password):
                result.error_message = "登录失败"
                return result
            if not browser.save_cookies_for_screenshot(username):
                result.error_message = "保存Cookie失败"
                return result

    log(f"导航到 '{browse_type}' 页面...")

    # Derive the site root from the configured login URL.
    parsed = urlsplit(config.zsgl.login_url)
    base = f"{parsed.scheme}://{parsed.netloc}"
    # NOTE(review): bz is fixed at 0 regardless of browse_type, so every
    # browse type captures the same page. Confirm whether other types need
    # a different value.
    bz = 0
    target_url = f"{base}/admin/center.aspx?bz={bz}"
    index_url = f"{base}/admin/index.aspx"

    # Remark, user name and browse type are free text. Strip characters
    # that are illegal in file names or would change the directory.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    account_name = _sanitize_filename_part(remark if remark else username)
    type_label = _sanitize_filename_part(browse_type, fallback="browse")
    screenshot_filename = f"{account_name}_{type_label}_{timestamp}.jpg"
    screenshot_path = str(SCREENSHOTS_DIR / screenshot_filename)

    run_script = _build_frame_navigation_script(target_url)

    # First try the full index page; fall back to the bare target page.
    log("正在截图...")
    cookies_for_shot = cookie_path if is_cookie_jar_fresh(cookie_path) else None

    success = take_screenshot_wkhtmltoimage(
        index_url,
        screenshot_path,
        cookies_path=cookies_for_shot,
        proxy_server=proxy_server,
        run_script=run_script,
        window_status="ready",
        log_callback=log,
    )

    if not success:
        log("尝试直接截图目标页...")
        success = take_screenshot_wkhtmltoimage(
            target_url,
            screenshot_path,
            cookies_path=cookies_for_shot,
            proxy_server=proxy_server,
            log_callback=log,
        )

    if success and os.path.exists(screenshot_path) and os.path.getsize(screenshot_path) > 1000:
        log(f"[OK] 截图成功: {screenshot_filename}")
        result.success = True
        result.filename = screenshot_filename
        result.filepath = screenshot_path
        return result

    result.error_message = "截图失败"
    if os.path.exists(screenshot_path):
        try:
            os.remove(screenshot_path)
        except OSError as exc:
            # A leftover file must not turn a reported failure into a crash.
            log(f"删除无效截图文件失败: {exc}")
    return result