Files
xb/app/server.py

3349 lines
108 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
from __future__ import annotations
import argparse
from concurrent.futures import ThreadPoolExecutor, as_completed
import ctypes
import gc
import hashlib
import io
import json
import os
import re
import shutil
import subprocess
import threading
import time
import uuid
from datetime import datetime, timedelta
from http import HTTPStatus
from http.server import SimpleHTTPRequestHandler, ThreadingHTTPServer
from pathlib import Path
from typing import Any, Callable
from urllib.parse import quote, urlparse
from pptx.enum.text import PP_ALIGN
from pptx import Presentation
from api_get_routes import handle_get_routes
from api_post_routes import handle_post_routes
# --- Filesystem layout (anchored at this file's directory) -------------------
BASE_DIR = Path(__file__).resolve().parent
STATIC_DIR = BASE_DIR / "static"
CONFIG_PATH = BASE_DIR / "config" / "xibao_logic.json"
DATA_DIR = BASE_DIR / "data"
DEFAULT_HISTORY_PATH = DATA_DIR / "generated_history.json"
DEFAULT_OUTPUT_DIR = BASE_DIR / "output"
# Bundled PPTX template used when neither config, env, nor request supplies one.
DEFAULT_TEMPLATE_FALLBACK = BASE_DIR.parent / "黄金三十天喜报模版(余额、保险、理财)(1).pptx"
REVIEW_LOG_DIR = DATA_DIR / "review_logs"
MANUAL_RULES_PATH = DATA_DIR / "manual_rules.json"
ISSUE_MARKS_PATH = DATA_DIR / "marked_issues.json"
SKIPPED_SUPPRESS_PATH = DATA_DIR / "skipped_suppressions.json"

# --- External converter tooling ---------------------------------------------
# Docker image used for document conversion when no local binary is found.
CONVERTER_IMAGE = "minidocks/libreoffice"
LOCAL_LIBREOFFICE_BIN = shutil.which("libreoffice") or shutil.which("soffice")
LOCAL_PDFTOPPM_BIN = shutil.which("pdftoppm")

# Ensure writable directories exist at import time (deliberate side effect).
DATA_DIR.mkdir(parents=True, exist_ok=True)
DEFAULT_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
REVIEW_LOG_DIR.mkdir(parents=True, exist_ok=True)

# --- Process-wide locks guarding the mutable state below ---------------------
_HISTORY_LOCK = threading.Lock()
_DOWNLOAD_LOCK = threading.Lock()        # guards DOWNLOAD_CACHE
_CONVERTER_LOCK = threading.Lock()
_LOG_LOCK = threading.Lock()             # serializes review-log appends
_PROGRESS_LOCK = threading.Lock()        # guards GEN_PROGRESS
_RULES_LOCK = threading.Lock()           # guards manual_rules.json I/O
_ISSUE_LOCK = threading.Lock()           # guards marked_issues.json I/O
_SKIP_SUPPRESS_LOCK = threading.Lock()   # guards skipped_suppressions.json I/O
_ACTIVE_JOB_LOCK = threading.Lock()      # guards ACTIVE_JOB_DIRS
_GENERATE_SLOT_LOCK = threading.Lock()   # single-flight slot for generation
_GENERATE_STATE_LOCK = threading.Lock()  # guards ACTIVE_GENERATION

# --- In-memory state ---------------------------------------------------------
DOWNLOAD_CACHE: dict[str, dict[str, Any]] = {}  # token -> download metadata
GEN_PROGRESS: dict[str, dict[str, Any]] = {}    # token -> progress snapshot
ACTIVE_JOB_DIRS: set[str] = set()               # resolved job dirs still in use
ACTIVE_GENERATION: dict[str, Any] = {"token": "", "started_at": 0.0}
# NOTE(review): flipped elsewhere once the converter image is confirmed pulled.
CONVERTER_IMAGE_READY = False
# NOTE(review): presumably bound to glibc malloc_trim via ctypes elsewhere — confirm.
_MALLOC_TRIM_FN: Callable[[int], int] | None = None
# Deposit-tenor classification rules: the first regex that matches a line
# decides the tenor type and which template page renders it.
TERM_DEFS = [
    {
        "type": "三个月定期",  # 3-month fixed deposit
        "page": "page_0",
        "regex": re.compile(r"(?:三|3)\s*个?月"),
    },
    {
        "type": "六个月定期",  # 6-month fixed deposit ("半年" = half a year)
        "page": "page_1",
        "regex": re.compile(r"(?:六|6)\s*个?月|半年"),
    },
    {
        "type": "一年期定期",  # 1-year fixed deposit, several phrasings
        "page": "page_2",
        "regex": re.compile(r"(?:定期)?\s*(?:存\s*)?(?:一|1)\s*年(?:期)?|定期一年|存1年"),
    },
]
# Chinese numeral digit -> integer value. Covers both zeros (零/〇), the
# standard digits, colloquial 两 (= 2), and the formal/banker digits 壹..玖.
# NOTE(review): the key characters were stripped from this file by an
# "ambiguous Unicode" viewer (the original shows 12 + 9 entries mapping to
# 0,0,1,2,2,3..9 and 1..9); they are reconstructed here from the character
# class of CHINESE_NUMBER_RE below — confirm against the original source.
CHINESE_DIGIT_MAP = {
    "零": 0,
    "〇": 0,
    "一": 1,
    "二": 2,
    "两": 2,
    "三": 3,
    "四": 4,
    "五": 5,
    "六": 6,
    "七": 7,
    "八": 8,
    "九": 9,
    "壹": 1,
    "贰": 2,
    "叁": 3,
    "肆": 4,
    "伍": 5,
    "陆": 6,
    "柒": 7,
    "捌": 8,
    "玖": 9,
}
# Chinese magnitude unit -> multiplier (亿 survived the stripping; the other
# keys are reconstructed to match chinese_integer_to_float()'s unit logic).
CHINESE_UNIT_MAP = {"十": 10, "百": 100, "千": 1000, "万": 10000, "亿": 100000000}
# One maximal run of Chinese-numeral characters: digits (standard, colloquial
# 两, banker forms), magnitude units, and the decimal point 点/點.
CHINESE_NUMBER_RE = re.compile(r"[零〇一二两三四五六七八九十百千万亿壹贰叁肆伍陆柒捌玖拾佰仟点點]+")
# Regex fallbacks used by extract_status() after exact keyword matching fails:
# status label -> patterns recognizing free-form phrasings of that action.
# The bounded gaps [^,。,;\n]{0,N} keep both words inside the same clause.
STATUS_HEURISTIC_PATTERNS: dict[str, list[re.Pattern[str]]] = {
    "揽收现金": [
        re.compile(r"揽收[^,。,;\n]{0,24}现金"),
    ],
    "揽收彩礼": [
        re.compile(r"揽收[^,。,;\n]{0,24}(?:彩礼|礼金)"),
        re.compile(r"(?:彩礼|礼金)[^,。,;\n]{0,24}揽收"),
    ],
    "微信提现": [
        re.compile(r"(?:微信|支付宝)[^,。,;\n]{0,16}提现"),
        re.compile(r"提现[^,。,;\n]{0,16}(?:微信|支付宝)"),
    ],
    "商户提现": [
        re.compile(r"商户[^,。,;\n]{0,16}提现"),
        re.compile(r"提现[^,。,;\n]{0,16}商户"),
    ],
    "揽收商户": [
        re.compile(r"揽收[^,。,;\n]{0,16}商户"),
        re.compile(r"商户[^,。,;\n]{0,16}揽收"),
    ],
    "揽收他行": [
        re.compile(r"揽收[^,。,;\n]{0,20}他行"),
        re.compile(r"他行[^,。,;\n]{0,20}揽收"),
    ],
    "他行挖转": [
        re.compile(r"挖转[^,。,;\n]{0,20}他行"),
        re.compile(r"他行[^,。,;\n]{0,20}挖转"),
        re.compile(r"挖他行"),
    ],
}
class ConfigError(RuntimeError):
    """Raised when the server configuration is missing or invalid."""
    pass
def now_ts() -> str:
    """Return the current local time as a compact ``YYYYMMDD_HHMMSS`` stamp."""
    return f"{datetime.now():%Y%m%d_%H%M%S}"
def load_config() -> dict[str, Any]:
    """Load the JSON config at CONFIG_PATH.

    Raises ConfigError when the file is missing or its root is not an object.
    """
    if not CONFIG_PATH.exists():
        raise ConfigError(f"config file not found: {CONFIG_PATH}")
    data = json.loads(CONFIG_PATH.read_text(encoding="utf-8"))
    if not isinstance(data, dict):
        raise ConfigError("config root must be a JSON object")
    return data
def is_windows_path(path_text: str) -> bool:
    """True when the string looks like an absolute Windows path (``C:\\...``)."""
    return re.match(r"^[A-Za-z]:\\", path_text) is not None
def resolve_history_path(config: dict[str, Any]) -> Path:
    """Resolve the history-file path from config, else the default.

    Windows-style paths (from a config written on another machine) are
    ignored. Relative paths are anchored at BASE_DIR. The parent directory
    is created on demand.
    """
    configured = str(config.get("history_file", "")).strip()
    if configured and not is_windows_path(configured):
        candidate = Path(configured)
        if not candidate.is_absolute():
            candidate = BASE_DIR / candidate
        candidate.parent.mkdir(parents=True, exist_ok=True)
        return candidate
    DEFAULT_HISTORY_PATH.parent.mkdir(parents=True, exist_ok=True)
    return DEFAULT_HISTORY_PATH
def resolve_template_path(config: dict[str, Any], override_path: str | None = None) -> Path:
    """Return the first existing template file among the configured sources.

    Priority: request override, config.template_file (Windows paths
    ignored), XIBAO_TEMPLATE_FILE env var, bundled fallback. Relative paths
    are anchored at BASE_DIR. Raises ConfigError when none exists.
    """

    def anchored(raw: str) -> Path:
        # Anchor relative paths at the application directory.
        p = Path(raw)
        return p if p.is_absolute() else BASE_DIR / p

    candidates: list[Path] = []
    if override_path:
        candidates.append(anchored(override_path))
    cfg_template = str(config.get("template_file", "")).strip()
    if cfg_template and not is_windows_path(cfg_template):
        candidates.append(anchored(cfg_template))
    env_template = os.environ.get("XIBAO_TEMPLATE_FILE", "").strip()
    if env_template:
        candidates.append(anchored(env_template))
    candidates.append(DEFAULT_TEMPLATE_FALLBACK)
    for candidate in candidates:
        if candidate.exists() and candidate.is_file():
            return candidate
    raise ConfigError(
        "template file not found. Set config.template_file to a valid local path "
        "or pass template_file in request body."
    )
def resolve_output_dir(config: dict[str, Any], override_path: str | None = None) -> Path:
    """Resolve (and create) the output directory.

    Priority: request override, config.output_dir (Windows paths ignored),
    then DEFAULT_OUTPUT_DIR. Relative paths are anchored at BASE_DIR.
    """
    if override_path:
        chosen = Path(override_path)
        if not chosen.is_absolute():
            chosen = BASE_DIR / chosen
        chosen.mkdir(parents=True, exist_ok=True)
        return chosen
    configured = str(config.get("output_dir", "")).strip()
    if configured and not is_windows_path(configured):
        chosen = Path(configured)
        if not chosen.is_absolute():
            chosen = BASE_DIR / chosen
        chosen.mkdir(parents=True, exist_ok=True)
        return chosen
    DEFAULT_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    return DEFAULT_OUTPUT_DIR
def load_history(history_path: Path) -> list[dict[str, Any]]:
    """Read generation history from disk.

    Accepts either a bare JSON list or a ``{"records": [...]}`` wrapper and
    keeps only dict entries. A missing or corrupt/unreadable file yields []
    — consistent with the other loaders in this module (load_manual_rules,
    _load_issue_marks_unlocked), which never let a JSONDecodeError escape
    into a request handler.
    """
    if not history_path.exists():
        return []
    try:
        with history_path.open("r", encoding="utf-8") as f:
            raw = json.load(f)
    except Exception:
        # Treat unreadable history the same as absent history.
        return []
    if isinstance(raw, list):
        return [r for r in raw if isinstance(r, dict)]
    if isinstance(raw, dict) and isinstance(raw.get("records"), list):
        return [r for r in raw["records"] if isinstance(r, dict)]
    return []
def save_history(history_path: Path, records: list[dict[str, Any]]) -> None:
    """Persist history records as pretty-printed UTF-8 JSON."""
    payload = json.dumps(records, ensure_ascii=False, indent=2)
    history_path.write_text(payload, encoding="utf-8")
def load_manual_rules() -> list[dict[str, Any]]:
    """Load the manual-rules list from disk; best-effort, never raises.

    Returns [] when the file is missing, unreadable, or not a JSON list;
    non-dict entries are dropped. The read happens under _RULES_LOCK so a
    concurrent save_manual_rules() cannot be observed half-written.
    """
    # NOTE(review): the exists() check runs outside the lock; a file deleted
    # between check and open is caught by the except below.
    if not MANUAL_RULES_PATH.exists():
        return []
    with _RULES_LOCK:
        try:
            with MANUAL_RULES_PATH.open("r", encoding="utf-8") as f:
                raw = json.load(f)
        except Exception:
            return []
    if not isinstance(raw, list):
        return []
    return [x for x in raw if isinstance(x, dict)]
def save_manual_rules(rules: list[dict[str, Any]]) -> None:
    """Persist the full manual-rules list as UTF-8 JSON under _RULES_LOCK.

    Not crash-atomic (writes in place, no temp file), but serialized
    against concurrent readers/writers in this process.
    """
    with _RULES_LOCK:
        with MANUAL_RULES_PATH.open("w", encoding="utf-8") as f:
            json.dump(rules, f, ensure_ascii=False, indent=2)
def _load_issue_marks_unlocked() -> list[dict[str, Any]]:
    """Read marked_issues.json; caller must hold _ISSUE_LOCK.

    Best-effort: missing/corrupt/non-list content yields []; non-dict
    entries are dropped.
    """
    if not ISSUE_MARKS_PATH.exists():
        return []
    try:
        with ISSUE_MARKS_PATH.open("r", encoding="utf-8") as f:
            raw = json.load(f)
    except Exception:
        return []
    if not isinstance(raw, list):
        return []
    return [x for x in raw if isinstance(x, dict)]


def _save_issue_marks_unlocked(items: list[dict[str, Any]]) -> None:
    """Write the full issue-mark list; caller must hold _ISSUE_LOCK."""
    ISSUE_MARKS_PATH.parent.mkdir(parents=True, exist_ok=True)
    with ISSUE_MARKS_PATH.open("w", encoding="utf-8") as f:
        json.dump(items, f, ensure_ascii=False, indent=2)
def list_issue_marks(status: str = "active", limit: int = 500) -> tuple[list[dict[str, Any]], int]:
    """Return (filtered issue marks, total matching count).

    *status* is one of "active" / "resolved" / "all"; unknown values fall
    back to "active". *limit* is clamped to 1..2000 and applied after
    sorting newest-first by updated_at (falling back to created_at). Each
    returned row is a copy with a normalized "status" field.
    """
    status_norm = str(status or "active").strip().lower()
    if status_norm not in {"active", "resolved", "all"}:
        status_norm = "active"
    limit = max(1, min(2000, int(limit)))
    with _ISSUE_LOCK:
        items = _load_issue_marks_unlocked()
    normalized: list[dict[str, Any]] = []
    for item in items:
        obj = dict(item)  # copy so callers cannot mutate stored entries
        item_status = str(obj.get("status", "active")).strip().lower()
        if item_status not in {"active", "resolved"}:
            item_status = "active"
        obj["status"] = item_status
        normalized.append(obj)
    if status_norm == "all":
        filtered = normalized
    else:
        filtered = [x for x in normalized if str(x.get("status", "active")) == status_norm]
    filtered.sort(
        key=lambda x: str(x.get("updated_at") or x.get("created_at") or ""),
        reverse=True,
    )
    total = len(filtered)
    return filtered[:limit], total
def upsert_issue_mark(
    *,
    mark_type: str,
    source_line: str,
    note: str = "",
    record: dict[str, Any] | None = None,
) -> tuple[dict[str, Any], bool]:
    """Create or refresh an issue mark keyed by (mark_type, source_line).

    An existing *active* mark with identical type and line gets its
    note/record/updated_at refreshed; otherwise a brand-new active mark is
    appended. Returns (mark copy, created_flag).

    Raises ValueError for an unknown mark_type or a blank source_line.
    """
    mark_type_text = str(mark_type).strip()
    if mark_type_text not in {"recognition_error", "generation_error"}:
        raise ValueError("mark_type must be recognition_error or generation_error")
    line_text = str(source_line).strip()
    if not line_text:
        raise ValueError("source_line is required")
    now = datetime.now().isoformat(timespec="seconds")
    record_obj = dict(record) if isinstance(record, dict) else {}
    note_text = str(note).strip()
    with _ISSUE_LOCK:
        items = _load_issue_marks_unlocked()
        for item in items:
            if not isinstance(item, dict):
                continue
            # Only an active mark with matching type + line is refreshed.
            if str(item.get("status", "active")).strip().lower() != "active":
                continue
            if str(item.get("mark_type", "")).strip() != mark_type_text:
                continue
            if str(item.get("source_line", "")).strip() != line_text:
                continue
            item["note"] = note_text
            item["record"] = record_obj
            item["updated_at"] = now
            _save_issue_marks_unlocked(items)
            return dict(item), False
        issue = {
            "id": uuid.uuid4().hex[:12],
            "mark_type": mark_type_text,
            "source_line": line_text,
            "note": note_text,
            "record": record_obj,
            "status": "active",
            "created_at": now,
            "updated_at": now,
        }
        items.append(issue)
        _save_issue_marks_unlocked(items)
        return dict(issue), True
def update_issue_mark(
    *,
    issue_id: str,
    mark_type: str | None = None,
    source_line: str | None = None,
    note: str | None = None,
    record: dict[str, Any] | None = None,
) -> dict[str, Any] | None:
    """Partially update an issue mark by id; None fields are left untouched.

    Returns a copy of the updated mark, or None when the id is unknown.
    Raises ValueError for a blank id, an invalid mark_type, or a blank
    source_line.
    """
    issue_id_text = str(issue_id).strip()
    if not issue_id_text:
        raise ValueError("id is required")
    now = datetime.now().isoformat(timespec="seconds")
    with _ISSUE_LOCK:
        items = _load_issue_marks_unlocked()
        target = None
        for item in items:
            if str(item.get("id", "")).strip() == issue_id_text:
                target = item
                break
        if target is None:
            return None
        if mark_type is not None:
            mark_type_text = str(mark_type).strip()
            if mark_type_text not in {"recognition_error", "generation_error"}:
                raise ValueError("mark_type must be recognition_error or generation_error")
            target["mark_type"] = mark_type_text
        if source_line is not None:
            source_line_text = str(source_line).strip()
            if not source_line_text:
                raise ValueError("source_line is required")
            target["source_line"] = source_line_text
        if note is not None:
            target["note"] = str(note).strip()
        if record is not None:
            target["record"] = dict(record) if isinstance(record, dict) else {}
        target["updated_at"] = now
        _save_issue_marks_unlocked(items)
        return dict(target)
def delete_issue_mark(issue_id: str) -> bool:
    """Remove every issue mark whose id equals *issue_id*.

    Returns True when at least one entry was removed; the file is only
    rewritten when something actually changed. Raises ValueError on a
    blank id.
    """
    wanted = str(issue_id).strip()
    if not wanted:
        raise ValueError("id is required")
    with _ISSUE_LOCK:
        items = _load_issue_marks_unlocked()
        remain = [entry for entry in items if str(entry.get("id", "")).strip() != wanted]
        deleted = len(remain) != len(items)
        if deleted:
            _save_issue_marks_unlocked(remain)
        return deleted
def resolve_issue_marks_by_source_line(source_line: str, reason: str = "") -> dict[str, Any]:
    """Flip every active issue mark for *source_line* to "resolved".

    Returns {"count": n, "ids": [...]} describing the marks changed; a
    blank line resolves nothing. *reason*, when given, is stored on each
    mark as resolved_reason.
    """
    line_text = str(source_line).strip()
    if not line_text:
        return {"count": 0, "ids": []}
    now = datetime.now().isoformat(timespec="seconds")
    resolved_ids: list[str] = []
    with _ISSUE_LOCK:
        items = _load_issue_marks_unlocked()
        changed = False
        for item in items:
            if str(item.get("status", "active")).strip().lower() != "active":
                continue
            if str(item.get("source_line", "")).strip() != line_text:
                continue
            item["status"] = "resolved"
            item["resolved_at"] = now
            item["updated_at"] = now
            if reason:
                item["resolved_reason"] = reason
            resolved_ids.append(str(item.get("id", "")).strip())
            changed = True
        if changed:
            _save_issue_marks_unlocked(items)
    # Entries without an id are counted out (blank ids filtered here).
    return {"count": len([x for x in resolved_ids if x]), "ids": [x for x in resolved_ids if x]}
def normalize_skip_line(line: str) -> str:
    """Collapse a skipped line to its whitespace-free form for keying."""
    return re.sub(r"\s+", "", str(line or "").strip())


def skip_suppression_key(line: str, reason: str = "") -> str:
    """Build the ``reason|line`` suppression key; "" when the line is blank."""
    body = normalize_skip_line(line)
    if not body:
        return ""
    head = str(reason or "").strip()
    return f"{head}|{body}"


def skip_suppression_keys_for_item(line: str, reason: str = "") -> list[str]:
    """Candidate keys that suppress this item.

    The exact-reason key (when a reason exists) comes first, the wildcard
    ``*|line`` key last. Blank lines produce no keys.
    """
    body = normalize_skip_line(line)
    if not body:
        return []
    head = str(reason or "").strip()
    keys = [f"*|{body}"]
    if head:
        keys.insert(0, f"{head}|{body}")
    return keys
def _load_skip_suppressions_unlocked() -> list[dict[str, Any]]:
    """Read skipped_suppressions.json; caller must hold _SKIP_SUPPRESS_LOCK.

    Best-effort: missing/corrupt/non-list content yields []; non-dict
    entries are dropped.
    """
    if not SKIPPED_SUPPRESS_PATH.exists():
        return []
    try:
        with SKIPPED_SUPPRESS_PATH.open("r", encoding="utf-8") as f:
            raw = json.load(f)
    except Exception:
        return []
    if not isinstance(raw, list):
        return []
    return [x for x in raw if isinstance(x, dict)]


def _save_skip_suppressions_unlocked(items: list[dict[str, Any]]) -> None:
    """Write the suppression list; caller must hold _SKIP_SUPPRESS_LOCK."""
    SKIPPED_SUPPRESS_PATH.parent.mkdir(parents=True, exist_ok=True)
    with SKIPPED_SUPPRESS_PATH.open("w", encoding="utf-8") as f:
        json.dump(items, f, ensure_ascii=False, indent=2)
def list_skip_suppressions() -> list[dict[str, Any]]:
    """Snapshot of the persisted suppression entries (lock-protected read)."""
    with _SKIP_SUPPRESS_LOCK:
        return _load_skip_suppressions_unlocked()


def build_skip_suppression_lookup(items: list[dict[str, Any]]) -> set[str]:
    """Collect suppression keys from *items* into a fast-membership set.

    Entries without a stored "key" fall back to recomputing one from their
    line/reason fields; non-dict entries are ignored.
    """
    keys: set[str] = set()
    for entry in items:
        if not isinstance(entry, dict):
            continue
        stored = str(entry.get("key", "")).strip()
        if stored:
            keys.add(stored)
        else:
            rebuilt = skip_suppression_key(
                str(entry.get("line", "")).strip(),
                str(entry.get("reason", "")).strip(),
            )
            if rebuilt:
                keys.add(rebuilt)
    return keys


def is_skip_item_suppressed(line: str, reason: str, lookup: set[str]) -> bool:
    """True when any candidate key for (line, reason) appears in *lookup*."""
    return any(key in lookup for key in skip_suppression_keys_for_item(line, reason))
def suppress_skip_item(line: str, reason: str = "") -> tuple[dict[str, Any], bool]:
    """Persist a suppression so this skipped line is no longer surfaced.

    Returns (entry, created_flag): an existing entry with the same key only
    gets its updated_at refreshed (created_flag False). Raises ValueError
    when the line is blank.
    """
    line_text = str(line or "").strip()
    if not line_text:
        raise ValueError("line is required")
    reason_text = str(reason or "").strip()
    # "*" means suppress the same line regardless of reason.
    normalized_reason = reason_text if reason_text else "*"
    key = skip_suppression_key(line_text, normalized_reason)
    if not key:
        raise ValueError("line is required")
    now = datetime.now().isoformat(timespec="seconds")
    with _SKIP_SUPPRESS_LOCK:
        items = _load_skip_suppressions_unlocked()
        for item in items:
            if str(item.get("key", "")).strip() != key:
                continue
            item["updated_at"] = now
            _save_skip_suppressions_unlocked(items)
            return dict(item), False
        obj = {
            "id": uuid.uuid4().hex[:12],
            "key": key,
            "line": line_text,
            "reason": normalized_reason,
            "created_at": now,
            "updated_at": now,
        }
        items.append(obj)
        _save_skip_suppressions_unlocked(items)
        return obj, True


def clear_skip_suppressions() -> int:
    """Delete all suppressions; returns how many were removed."""
    with _SKIP_SUPPRESS_LOCK:
        items = _load_skip_suppressions_unlocked()
        count = len(items)
        _save_skip_suppressions_unlocked([])
        return count
def register_active_job_dir(job_dir: Path) -> None:
    """Mark *job_dir* as in-use so cleanup passes will not delete it."""
    resolved = str(job_dir.resolve())
    with _ACTIVE_JOB_LOCK:
        ACTIVE_JOB_DIRS.add(resolved)


def unregister_active_job_dir(job_dir: Path) -> None:
    """Drop *job_dir* from the in-use set (no error when absent)."""
    resolved = str(job_dir.resolve())
    with _ACTIVE_JOB_LOCK:
        ACTIVE_JOB_DIRS.discard(resolved)


def get_active_job_dirs() -> set[str]:
    """Return a copy of the in-use job-directory set."""
    with _ACTIVE_JOB_LOCK:
        return set(ACTIVE_JOB_DIRS)
def acquire_generation_slot(token: str = "") -> bool:
    """Try to claim the single generation slot without blocking.

    On success records the owning token and start time in
    ACTIVE_GENERATION; returns False when another generation already holds
    the slot.
    """
    if not _GENERATE_SLOT_LOCK.acquire(blocking=False):
        return False
    with _GENERATE_STATE_LOCK:
        ACTIVE_GENERATION["token"] = str(token).strip()
        ACTIVE_GENERATION["started_at"] = time.time()
    return True


def release_generation_slot(token: str = "") -> None:
    """Free the generation slot and clear the active-generation state.

    *token* is accepted for call-site symmetry but not checked — any caller
    may release. Safe to call even when the slot is not held.
    """
    _ = token
    with _GENERATE_STATE_LOCK:
        ACTIVE_GENERATION["token"] = ""
        ACTIVE_GENERATION["started_at"] = 0.0
    # NOTE(review): locked()-then-release() is racy in general; tolerable
    # here because only the slot holder is expected to call release, and a
    # losing race raises RuntimeError which is swallowed below.
    if _GENERATE_SLOT_LOCK.locked():
        try:
            _GENERATE_SLOT_LOCK.release()
        except RuntimeError:
            pass


def get_active_generation() -> dict[str, Any]:
    """Copy of the active-generation state ({"token", "started_at"})."""
    with _GENERATE_STATE_LOCK:
        return dict(ACTIVE_GENERATION)
def today_log_path() -> Path:
    """Path of today's JSONL review log (one file per calendar day)."""
    return REVIEW_LOG_DIR / f"review_{datetime.now():%Y-%m-%d}.jsonl"


def count_log_lines(path: Path) -> int:
    """Number of lines in *path*; 0 when the file does not exist."""
    if not path.exists():
        return 0
    with path.open("r", encoding="utf-8") as f:
        return sum(1 for _ in f)
def append_review_log(event: str, payload: dict[str, Any] | None = None) -> Path:
    """Append one JSON line to today's review log; returns the log path.

    *payload* keys are merged into the entry (and may override "ts"/"event").
    Appends are serialized by _LOG_LOCK so concurrent handlers do not
    interleave partial lines.
    """
    REVIEW_LOG_DIR.mkdir(parents=True, exist_ok=True)
    log_path = today_log_path()
    entry = {
        "ts": datetime.now().isoformat(timespec="seconds"),
        "event": str(event),
    }
    if isinstance(payload, dict):
        entry.update(payload)
    with _LOG_LOCK:
        with log_path.open("a", encoding="utf-8") as f:
            f.write(json.dumps(entry, ensure_ascii=False))
            f.write("\n")
    return log_path
def get_valid_status_list(config: dict[str, Any]) -> list[str]:
    """Valid status labels from config, blanks removed.

    Prefers replace_algorithm.status.valid_status; falls back to
    status_extraction.valid_status when the former is absent or empty.
    Returns [] when neither is a list.
    """
    configured = config.get("replace_algorithm", {}).get("status", {}).get("valid_status", [])
    if not (isinstance(configured, list) and configured):
        configured = config.get("status_extraction", {}).get("valid_status", [])
    if not isinstance(configured, list):
        return []
    stripped = (str(entry).strip() for entry in configured)
    return [label for label in stripped if label]
def normalize_status_value(status: str, config: dict[str, Any]) -> str:
    """Coerce *status* into the configured valid-status set.

    With no valid set configured, returns the stripped status (or the
    fallback when blank). Otherwise returns the status if valid, else the
    fallback when it is itself valid, else the first valid label.
    """
    fallback = str(config.get("status_extraction", {}).get("fallback", "成功营销")).strip() or "成功营销"
    allowed = get_valid_status_list(config)
    candidate = str(status).strip()
    if not allowed:
        return candidate or fallback
    if candidate in allowed:
        return candidate
    return fallback if fallback in allowed else allowed[0]
def is_date_header_line(normalized_line: str) -> bool:
    """True for a bare date header like ``3月5日`` / ``12月31号`` / ``7月8``."""
    return re.fullmatch(r"\d{1,2}月\d{1,2}[日号]?", normalized_line) is not None
def log_parse_skipped(skipped: list[dict[str, Any]], source: str) -> int:
    """Write a "parse_skip" review-log entry per skipped item.

    Returns the number of entries written. Items without a reason, or
    skipped by an explanatory skip-line rule, are not logged.
    """
    written = 0
    for item in skipped:
        if not isinstance(item, dict):
            continue
        reason = str(item.get("reason", "")).strip()
        # Explanatory lines are not logged, to keep the review log low-noise.
        if not reason or reason == "skip_line_rule":
            continue
        source_line = str(item.get("line", "")).strip()
        append_review_log(
            "parse_skip",
            {
                "source": source,
                "reason": reason,
                "source_line": source_line,
            },
        )
        written += 1
    return written
def is_path_under(path: Path, parent: Path) -> bool:
    """True when *path* resolves to a location inside (or equal to) *parent*.

    Any resolution failure counts as "not under".
    """
    try:
        path.resolve().relative_to(parent.resolve())
    except Exception:
        return False
    return True
def cleanup_output_artifacts(output_dir: Path) -> dict[str, int]:
    """Delete stale generation artifacts and prune the download cache.

    Removes job_* directories (except those registered as active) and loose
    .png/.pdf/.pptx/.zip files directly under *output_dir*. Download-cache
    entries whose file vanished, or that point inside *output_dir*, are
    evicted. Per-entry failures only bump the error counter. Returns
    counters: removed_dirs / removed_files / errors.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    removed_dirs = 0
    removed_files = 0
    errors = 0
    active_dirs = get_active_job_dirs()
    for entry in output_dir.iterdir():
        try:
            if entry.is_dir() and entry.name.startswith("job_"):
                if str(entry.resolve()) in active_dirs:
                    continue  # a running generation still owns this directory
                shutil.rmtree(entry)
                removed_dirs += 1
                continue
            if entry.is_file() and entry.suffix.lower() in {".png", ".pdf", ".pptx", ".zip"}:
                entry.unlink(missing_ok=True)
                removed_files += 1
        except Exception:
            errors += 1
    with _DOWNLOAD_LOCK:
        stale_tokens: list[str] = []
        for token, meta in DOWNLOAD_CACHE.items():
            file_path = Path(str(meta.get("file_path", "")))
            # Evict tokens pointing at deleted files or into the cleaned dir.
            if not file_path.exists() or is_path_under(file_path, output_dir):
                stale_tokens.append(token)
        for token in stale_tokens:
            DOWNLOAD_CACHE.pop(token, None)
    return {
        "removed_dirs": removed_dirs,
        "removed_files": removed_files,
        "errors": errors,
    }
def daily_cleanup_loop(stop_event: threading.Event) -> None:
    """Background loop: run cleanup_output_artifacts() once every midnight.

    Sleeps until the next local midnight (interruptible via *stop_event*),
    then reloads config, cleans the output directory, and logs the result.
    Errors are logged and the loop keeps running.
    """
    while not stop_event.is_set():
        now = datetime.now()
        next_midnight = (now + timedelta(days=1)).replace(hour=0, minute=0, second=0, microsecond=0)
        wait_seconds = max((next_midnight - now).total_seconds(), 1.0)
        if stop_event.wait(wait_seconds):
            break  # shutdown requested while waiting
        try:
            config = load_config()
            output_dir = resolve_output_dir(config)
            stat = cleanup_output_artifacts(output_dir)
            print(
                "[daily-cleanup] done at midnight, "
                f"removed_dirs={stat['removed_dirs']} removed_files={stat['removed_files']} errors={stat['errors']}"
            )
            append_review_log(
                "daily_cleanup",
                {
                    "output_dir": str(output_dir),
                    **stat,
                },
            )
        except Exception as exc:
            print(f"[daily-cleanup] failed: {exc}")
            append_review_log("daily_cleanup_error", {"error": str(exc)})
def build_history_lookup(history: list[dict[str, Any]], key_fields: list[str]) -> dict[str, dict[str, Any]]:
lookup: dict[str, dict[str, Any]] = {}
for item in history:
if not isinstance(item, dict):
continue
k = key_for_record(item, key_fields)
if not k:
continue
existing = lookup.get(k)
if existing is None:
lookup[k] = item
continue
prev_ts = str(existing.get("created_at", ""))
curr_ts = str(item.get("created_at", ""))
if curr_ts >= prev_ts:
lookup[k] = item
return lookup
def build_history_signature_lookup(history: list[dict[str, Any]]) -> dict[str, dict[str, Any]]:
lookup: dict[str, dict[str, Any]] = {}
for item in history:
if not isinstance(item, dict):
continue
k = signature_key_for_record(item)
if not k:
continue
existing = lookup.get(k)
if existing is None:
lookup[k] = item
continue
prev_ts = str(existing.get("created_at", ""))
curr_ts = str(item.get("created_at", ""))
if curr_ts >= prev_ts:
lookup[k] = item
return lookup
def resolve_history_image_path(history_record: dict[str, Any], config: dict[str, Any]) -> Path | None:
    """Locate the rendered image file for a history record.

    Candidate order: explicit image_path (relative paths anchored at
    BASE_DIR), then <output>/<job_id>/<png_file|output_file>, then — for
    legacy records that only stored output_file — the first job_* directory
    containing that file. Returns the first existing regular file, else
    None.
    """
    candidates: list[Path] = []
    image_path = str(history_record.get("image_path", "")).strip()
    if image_path:
        p = Path(image_path)
        if not p.is_absolute():
            p = BASE_DIR / p
        candidates.append(p)
    output_dir = resolve_output_dir(config)
    png_file = str(history_record.get("png_file", "")).strip()
    output_file = str(history_record.get("output_file", "")).strip()
    job_id = str(history_record.get("job_id", "")).strip()
    if png_file and job_id:
        candidates.append(output_dir / job_id / png_file)
    if output_file and job_id:
        candidates.append(output_dir / job_id / output_file)
    if output_file:
        # Legacy history records stored only output_file: probe job_* dirs.
        for p in output_dir.glob(f"job_*/{output_file}"):
            candidates.append(p)
            break
    seen: set[str] = set()
    for c in candidates:
        key = str(c)
        if key in seen:
            continue  # skip duplicate candidate paths
        seen.add(key)
        if c.exists() and c.is_file():
            return c
    return None
def attach_duplicate_download_info(
    dup_record: dict[str, Any],
    history_record: dict[str, Any] | None,
    config: dict[str, Any],
) -> None:
    """Decorate a duplicate-detection record in place with a download link.

    No-op when the matching history record (or its rendered image) cannot
    be found. On success sets download_token / download_url / image_name and
    fills output_file when it was missing.
    """
    if not isinstance(history_record, dict):
        return
    image_path = resolve_history_image_path(history_record, config)
    if not image_path:
        return
    token = register_download(image_path, content_type="image/png")
    dup_record["download_token"] = token
    dup_record["download_url"] = f"/api/download/{token}"
    dup_record["image_name"] = image_path.name
    if not dup_record.get("output_file"):
        dup_record["output_file"] = image_path.name
def build_history_view_items(
    history: list[dict[str, Any]],
    config: dict[str, Any],
    limit: int = 300,
) -> list[dict[str, Any]]:
    """Shape history records for the UI: newest first, capped at *limit*.

    Each row is a copy enriched with image_exists and — when the rendered
    image still exists on disk — a fresh download token/URL and image_name.
    A non-positive *limit* disables the cap.
    """
    items = [x for x in history if isinstance(x, dict)]
    items.sort(key=lambda x: str(x.get("created_at", "")), reverse=True)
    if limit > 0:
        items = items[:limit]
    rows: list[dict[str, Any]] = []
    for item in items:
        row = dict(item)  # copy so callers cannot mutate stored history
        image_path = resolve_history_image_path(row, config)
        if image_path:
            token = register_download(image_path, content_type="image/png")
            row["download_token"] = token
            row["download_url"] = f"/api/download/{token}"
            row["image_exists"] = True
            row["image_name"] = image_path.name
        else:
            row["image_exists"] = False
        rows.append(row)
    return rows
def _cleanup_progress_cache_locked() -> None:
    """Drop progress entries idle for over 3 hours; caller holds _PROGRESS_LOCK."""
    cutoff = time.time() - 3 * 3600
    stale = [
        token
        for token, entry in GEN_PROGRESS.items()
        if float(entry.get("updated_at", 0)) < cutoff
    ]
    for token in stale:
        GEN_PROGRESS.pop(token, None)


def cleanup_progress_cache() -> None:
    """Thread-safe wrapper around _cleanup_progress_cache_locked()."""
    with _PROGRESS_LOCK:
        _cleanup_progress_cache_locked()
def set_generation_progress(
    token: str,
    *,
    status: str,
    stage: str,
    percent: int,
    detail: str = "",
    error: str = "",
) -> None:
    """Record a progress snapshot for *token* in the in-memory cache.

    Blank tokens are ignored; percent is clamped to 0..100. Stale entries
    are pruned on every write.
    """
    key = str(token).strip()
    if not key:
        return
    entry = {
        "token": key,
        "status": status,
        "stage": stage,
        "percent": min(100, max(0, int(percent))),
        "detail": detail,
        "error": error,
        "updated_at": time.time(),
        "updated_at_text": datetime.now().isoformat(timespec="seconds"),
    }
    with _PROGRESS_LOCK:
        _cleanup_progress_cache_locked()
        GEN_PROGRESS[key] = entry


def get_generation_progress(token: str) -> dict[str, Any] | None:
    """Return a copy of the progress entry for *token*, or None."""
    key = str(token).strip()
    if not key:
        return None
    with _PROGRESS_LOCK:
        _cleanup_progress_cache_locked()
        entry = GEN_PROGRESS.get(key)
        return dict(entry) if entry is not None else None
def strip_trailing_zero(num_text: str) -> str:
    """Trim trailing zeros (and a dangling dot) from a decimal string."""
    if "." in num_text:
        return num_text.rstrip("0").rstrip(".")
    return num_text
def floor_amount_number(value: Any) -> int | None:
s = str(value).strip()
if not s:
return None
m = re.search(r"\d+(?:\.\d+)?", s)
if not m:
return None
try:
num = float(m.group(0))
except ValueError:
return None
return max(0, int(num))
def normalize_amount_text(value: Any) -> str:
v = floor_amount_number(value)
if v is None:
return ""
return f"{v}"
def normalize_line(line: str, line_pattern: str) -> str:
    """Strip a line, delete *line_pattern* matches, then remove all whitespace.

    An invalid *line_pattern* regex is silently ignored.
    """
    text = line.strip()
    if not text:
        return ""
    try:
        text = re.sub(line_pattern, "", text)
    except re.error:
        pass  # bad configured pattern: keep the text untouched
    return re.sub(r"\s+", "", text).strip()
def extract_line_serial(source_line: str, line_pattern: str) -> str:
    """Pull the leading serial number from a line as a canonical int string.

    First tries the configured line-prefix regex *line_pattern*; when that
    match carries no digits (or the regex is invalid), falls back to common
    numbering shapes such as ``12、`` / ``12.`` / ``12)``. Returns "" when
    nothing matches.
    """
    text = str(source_line).strip()
    if not text:
        return ""
    # Prefer extracting from the configured line prefix regex.
    try:
        prefix_match = re.match(line_pattern, text)
    except re.error:
        prefix_match = None
    if prefix_match:
        digits = re.search(r"\d+", prefix_match.group(0))
        if digits:
            return str(int(digits.group(0)))
    # Fallback: common numbering formats (e.g. "12、", "12.", "12)").
    fallback = re.match(r"^\s*(\d+)\s*(?:[、,.\)]\s*)?", text)
    return str(int(fallback.group(1))) if fallback else ""
def line_signature(normalized_line: str) -> str:
    """16-hex-char SHA-1 fingerprint of a line with all whitespace removed.

    Returns "" for an effectively empty line.
    """
    compact = re.sub(r"\s+", "", str(normalized_line).strip())
    if not compact:
        return ""
    return hashlib.sha1(compact.encode("utf-8")).hexdigest()[:16]
def extract_branch(line: str, config: dict[str, Any]) -> tuple[str | None, str | None]:
branches_cfg = config.get("branches", {})
allowed = branches_cfg.get("allowed", [])
alias = branches_cfg.get("alias", {})
candidates: list[str] = []
if isinstance(alias, dict):
candidates.extend(str(k) for k in alias.keys())
if isinstance(allowed, list):
candidates.extend(str(b) for b in allowed)
candidates = sorted(set(candidates), key=len, reverse=True)
raw_branch = None
for name in candidates:
if name and name in line:
raw_branch = name
break
if raw_branch is None:
return None, None
normalized = str(alias.get(raw_branch, raw_branch)) if isinstance(alias, dict) else raw_branch
return raw_branch, normalized
def match_status_heuristic(line: str, status: str) -> bool:
    """True when any heuristic regex registered for *status* fires on *line*."""
    patterns = STATUS_HEURISTIC_PATTERNS.get(str(status).strip(), [])
    return any(pattern.search(line) for pattern in patterns)
def extract_status(line: str, config: dict[str, Any]) -> str:
    """Derive the marketing-status label for a source line.

    Resolution order: configured keyword rules (walked in priority_order),
    then the regex heuristics in STATUS_HEURISTIC_PATTERNS, then two
    hard-coded special cases for 提现/他行 wording, and finally the
    configured fallback status.
    """
    # Normalize common wording variants before status matching.
    status_line = str(line or "").replace("外行", "他行")
    cfg = config.get("status_extraction", {})
    rules = cfg.get("extraction_rules", {})
    order = cfg.get("priority_order", [])
    fallback = str(cfg.get("fallback", "成功营销"))
    if not isinstance(rules, dict):
        return fallback
    if not isinstance(order, list) or not order:
        order = [str(k) for k in rules.keys()]
    # Pass 1: exact keyword containment per configured rule.
    for status in order:
        keywords = rules.get(status, [status])
        if not isinstance(keywords, list):
            continue
        for kw in keywords:
            kw_text = str(kw)
            if kw_text and kw_text in status_line:
                return normalize_status_value(str(status), config)
    # Pass 2: regex heuristics, restricted to valid statuses when configured.
    valid_status = set(get_valid_status_list(config))
    for status in order:
        s = str(status).strip()
        if not s:
            continue
        if valid_status and s not in valid_status:
            continue
        if match_status_heuristic(status_line, s):
            return normalize_status_value(s, config)
    # Heuristic: plain "提现" defaults to 微信提现 unless it clearly indicates 商户.
    if "提现" in status_line:
        if "商户" in status_line and "商户提现" in valid_status:
            return "商户提现"
        if "微信提现" in valid_status:
            return "微信提现"
    # Heuristic: lines like "揽收他行40万存一年" must not fall back to the
    # generic success status.
    if "他行" in status_line:
        if "揽收他行" in valid_status:
            return "揽收他行"
        if "他行挖转" in valid_status:
            return "他行挖转"
    return normalize_status_value(fallback, config)
def normalize_chinese_number_text(text: str) -> str:
    """Canonicalize Chinese numeral text before parsing.

    Maps the formal unit characters to their common forms (拾→十, 佰→百,
    仟→千) and the variant decimal point 點 to 点, so downstream parsers
    only need one spelling of each.

    NOTE(review): the replace() arguments were stripped from this file by
    an "ambiguous Unicode" viewer (four replace calls remained, all with
    blank strings); they are reconstructed from CHINESE_NUMBER_RE's
    character class and CHINESE_UNIT_MAP's keys — confirm against the
    original source.
    """
    return (
        str(text)
        .strip()
        .replace("拾", "十")
        .replace("佰", "百")
        .replace("仟", "千")
        .replace("點", "点")
    )
def chinese_integer_to_float(text: str) -> float | None:
    """Evaluate a Chinese integer numeral, e.g. 两千五百 -> 2500.0.

    Small units (value 10/100/1000) close the pending digit into the
    current section; big units (万/亿) scale the whole accumulated section
    into the running total. Returns None for empty input or any character
    outside the digit/unit maps.
    """
    s = normalize_chinese_number_text(text)
    if not s:
        return None
    total = 0    # completed big-unit sections (万/亿 already applied)
    section = 0  # value accumulated since the last big unit
    number = 0   # pending digit awaiting its unit
    seen = False
    for ch in s:
        if ch in CHINESE_DIGIT_MAP:
            number = CHINESE_DIGIT_MAP[ch]
            seen = True
            continue
        unit = CHINESE_UNIT_MAP.get(ch)
        if unit is None:
            return None  # not a numeral character
        seen = True
        if unit in {10, 100, 1000}:
            if number == 0:
                number = 1  # bare small unit: 十 -> 10, 百 -> 100, ...
            section += number * unit
            number = 0
            continue
        # Big unit: fold the pending digit in, then scale the section.
        section += number
        if section == 0:
            section = 1  # bare big unit counts as one (万 -> 10000)
        total += section * unit
        section = 0
        number = 0
    if not seen:
        return None
    # Remaining section/number cover trailing values after the last big unit.
    return float(total + section + number)
def chinese_text_to_float(text: str) -> float | None:
    """Parse a Chinese numeral, optionally with a decimal part after 点.

    Examples: 三点五 -> 3.5, 两千 -> 2000.0. The fractional part must be
    plain digits. Returns None for anything that is not a pure numeral.

    NOTE(review): the decimal-point literal (点) was stripped from this
    file by an "ambiguous Unicode" viewer and has been reconstructed from
    CHINESE_NUMBER_RE — confirm against the original source.
    """
    s = normalize_chinese_number_text(text)
    if not s:
        return None
    if "点" in s:
        int_part, frac_part = s.split("点", 1)
        if not frac_part:
            return None  # dangling decimal point
        int_value = chinese_integer_to_float(int_part) if int_part else 0.0
        if int_value is None:
            return None
        frac_digits: list[str] = []
        for ch in frac_part:
            digit = CHINESE_DIGIT_MAP.get(ch)
            if digit is None:
                return None  # fractional part must be bare digits
            frac_digits.append(str(digit))
        return float(f"{int(int_value)}.{''.join(frac_digits)}")
    return chinese_integer_to_float(s)
def chinese_digit_fraction(text: str) -> float | None:
    """Interpret pure digit text as a decimal fraction: 五 -> 0.5, 二五 -> 0.25.

    Returns None when the text is empty or contains any non-digit character.
    """
    s = normalize_chinese_number_text(text)
    if not s or any(ch not in CHINESE_DIGIT_MAP for ch in s):
        return None
    digits = "".join(str(CHINESE_DIGIT_MAP[ch]) for ch in s)
    return int(digits) / (10 ** len(digits))
def parse_wan_suffix_shorthand(text: str) -> float | None:
    """Value (in 万) of a colloquial tail after 万: the 五 in 两万五 -> 0.5.

    Returns 0.0 for an empty tail, and None when the tail carries its own
    magnitude unit (十/百/千/万/亿) and therefore is not shorthand. A tail
    containing 点 is parsed as a full decimal numeral and scaled by 1/10.

    NOTE(review): the unit-character tuple and the 点 literal were stripped
    from this file by an "ambiguous Unicode" viewer; they are reconstructed
    from CHINESE_UNIT_MAP / CHINESE_NUMBER_RE — confirm against the
    original source.
    """
    s = normalize_chinese_number_text(text)
    if not s:
        return 0.0
    if any(ch in s for ch in ("十", "百", "千", "万", "亿")):
        return None
    if "点" in s:
        right_val = chinese_text_to_float(s)
        if right_val is None:
            return None
        # e.g. "五点五" after "万" => 0.55 万
        return right_val / 10.0
    return chinese_digit_fraction(s)
def parse_chinese_amount_value(raw_text: str, right_1: str, right_2: str) -> float | None:
    """Parse a Chinese-numeral amount token.

    Returns the value expressed in 万 when a 万 unit is present (inside the
    token or immediately after it, per the *right_1*/*right_2* lookaheads),
    otherwise the plain numeric value. Handles colloquial shorthand
    (两万五 -> 2.5) and full mixed expressions (两万五千 -> 2.5).

    NOTE(review): the Chinese unit literals in this function were stripped
    by an "ambiguous Unicode" viewer; they are reconstructed from
    CHINESE_UNIT_MAP / CHINESE_NUMBER_RE and the surviving comments —
    confirm against the original source.
    """
    token = normalize_chinese_number_text(raw_text)
    if not token:
        return None
    has_digit = any(ch in CHINESE_DIGIT_MAP for ch in token)
    has_small_unit = any(ch in token for ch in ("十", "百", "千"))
    if not has_digit and not has_small_unit:
        return None  # bare 万/亿 with no digit is not an amount
    has_wan = ("万" in token) or (right_1 == "万") or right_2.startswith("万元")
    if has_wan:
        if "万" in token:
            left, right = token.split("万", 1)
            left_text = left.strip()
            right_text = right.strip()
            left_value = chinese_text_to_float(left_text) if left_text else 0.0
            if left_value is None:
                return None
            if not right_text:
                return left_value
            right_norm = normalize_chinese_number_text(right_text)
            # Full unit expression (e.g. 两万五千、两万零五百) -> convert absolute value to "万".
            if any(ch in right_norm for ch in ("十", "百", "千", "万", "亿")):
                absolute = chinese_text_to_float(token)
                if absolute is None:
                    return None
                return absolute / 10000.0
            # Colloquial shorthand (e.g. 两万五 -> 2.5万, 一万二 -> 1.2万).
            shorthand = parse_wan_suffix_shorthand(right_norm)
            if shorthand is not None:
                return left_value + shorthand
            right_value = chinese_text_to_float(right_norm)
            if right_value is None:
                return left_value
            return left_value + (right_value / 10.0)
        # e.g. "五 万" where the unit sits just outside the matched token.
        value = chinese_text_to_float(token)
        return value
    return chinese_text_to_float(token)
def extract_amount_candidates(text: str) -> list[dict[str, Any]]:
    """Find every plausible amount mention (Arabic or Chinese numerals).

    Each candidate records its numeric value, canonical digit text ("num"),
    span in *text*, and whether a 万 unit accompanies it ("has_wan").
    Numbers followed by date/tenor markers are excluded, as are single
    Chinese digits that are most likely part of a person's name.

    NOTE(review): the character-set literals below were stripped from this
    file by an "ambiguous Unicode" viewer; they are reconstructed from the
    surviving comments (date/tenor hints "2月23日, 5年交, 6个月"; keep
    unit-bearing forms like "十万"/"五万元") — confirm the exact sets
    against the original source.
    """
    candidates: list[dict[str, Any]] = []
    for m in re.finditer(r"\d+(?:\.\d+)?", text):
        num_text = m.group(0)
        start, end = m.span()
        right_2 = text[end : end + 2]
        right_1 = text[end : end + 1]
        # Exclude time/date/tenor hints: 2月23日, 5年交, 6个月 ...
        if right_1 in {"月", "日", "年", "号"}:
            continue
        if right_2.startswith("个月"):
            continue
        has_wan = False
        if right_1 in {"万", "W", "w"} or right_2.startswith("万元"):
            has_wan = True
        try:
            value = float(num_text)
        except ValueError:
            continue
        candidates.append(
            {
                "value": value,
                "num": strip_trailing_zero(num_text),
                "start": start,
                "end": end,
                "has_wan": has_wan,
            }
        )
    for m in CHINESE_NUMBER_RE.finditer(text):
        raw_text = m.group(0)
        start, end = m.span()
        right_2 = text[end : end + 2]
        right_1 = text[end : end + 1]
        if right_1 in {"月", "日", "年", "号"}:
            continue
        if right_2.startswith("个月"):
            continue
        # Avoid treating single Chinese digits in names as amount, e.g. "钱七/周八/吴九".
        # Keep unit-bearing forms (e.g. "十万", "五万元") untouched.
        if (
            len(raw_text) == 1
            and raw_text in CHINESE_DIGIT_MAP
            and right_1 not in {"万", "元"}
            and not right_2.startswith("万元")
        ):
            continue
        value = parse_chinese_amount_value(raw_text, right_1, right_2)
        if value is None:
            continue
        has_wan = ("万" in raw_text) or (right_1 == "万") or right_2.startswith("万元")
        candidates.append(
            {
                "value": value,
                "num": strip_trailing_zero(str(value)),
                "start": start,
                "end": end,
                "has_wan": has_wan,
            }
        )
    return candidates
def amount_num_to_text(num_text: str) -> str:
    """Render an amount string as its floored integer text, or "" when unparsable."""
    floored = floor_amount_number(num_text)
    return "" if floored is None else str(floored)
def pick_amount_from_candidates(
    text: str,
    candidates: list[dict[str, Any]],
    anchor_pos: int | None = None,
) -> str | None:
    """Choose the most plausible amount candidate and render it as text.

    Candidates carrying an explicit "万" marker are always preferred as a
    pool. Without an anchor, a total ("合计"/"共计") amount wins, then the
    largest value; with an anchor, the candidate closest to (and on ties,
    left of) the anchor position wins.
    """
    if not candidates:
        return None
    wan_marked = [item for item in candidates if bool(item.get("has_wan"))]
    pool = wan_marked if wan_marked else candidates
    if anchor_pos is None:
        # Prefer an explicit total amount, otherwise take the largest value.
        for item in pool:
            window = text[max(0, item["start"] - 2) : item["start"] + 2]
            if "合计" in window or "共计" in window:
                return amount_num_to_text(item["num"])
        largest = max(pool, key=lambda item: item["value"])
        return amount_num_to_text(largest["num"])

    def rank(item: dict[str, Any]) -> tuple[float, int]:
        # Distance to the anchor, discounted when a total marker sits just left.
        gap = abs(item["start"] - anchor_pos)
        lead = text[max(0, item["start"] - 3) : item["start"]]
        if "合计" in lead or "共计" in lead:
            gap -= 3
        # On equal distance, prefer the candidate on the anchor's left side.
        return (gap, 0 if item["start"] <= anchor_pos else 1)

    return amount_num_to_text(min(pool, key=rank)["num"])
def split_segments(text: str) -> list[str]:
    """Split a line on clause separators, dropping empty/whitespace parts."""
    return [segment.strip() for segment in re.split(r"[,;。]", text) if segment.strip()]
def normalize_insurance_year(value: Any) -> str | None:
if value is None:
return None
s = str(value).strip()
if s in {"3", "3年", "3年交", "三年", "三年交"}:
return "3"
if s in {"5", "5年", "5年交", "五年", "五年交"}:
return "5"
return None
def normalize_insurance_year_choices(value: Any) -> dict[str, str]:
    """Normalise a mapping of choice-key -> year, keeping only valid 3/5 entries."""
    if not isinstance(value, dict):
        return {}
    result: dict[str, str] = {}
    for raw_key, raw_year in value.items():
        choice_key = str(raw_key).strip()
        normalized = normalize_insurance_year(raw_year)
        if choice_key and normalized:
            result[choice_key] = normalized
    return result
def build_insurance_choice_key(
    *,
    source_line: str,
    raw_text: str,
    branch: str,
    amount: str,
    page: str,
    item_index: int,
) -> str:
    """Derive a short, stable identifier for one insurance item on one line.

    The key is the first 16 hex chars of a SHA-1 over the item's identifying
    fields, so identical line/item combinations always map to the same key.
    (Non-cryptographic use: only keyed lookup stability matters.)
    """
    material = "|".join((source_line, raw_text, branch, amount, page, str(item_index)))
    return hashlib.sha1(material.encode("utf-8")).hexdigest()[:16]
def pick_insurance_year_for_record(
    *,
    choice_key: str,
    source_line: str,
    raw_text: str,
    branch: str,
    amount: str,
    year_map: dict[str, str],
    default_year: str | None,
) -> str | None:
    """Resolve the 3/5-year choice for a record from the user-supplied map.

    Several lookup keys are tried from most to least specific; the first hit
    that normalises to a valid year wins, otherwise the default is used.
    """
    lookup_keys = (
        choice_key,
        source_line,
        raw_text,
        f"{source_line}|{amount}",
        f"{raw_text}|{amount}",
        f"{branch}|{amount}",
    )
    for lookup in lookup_keys:
        if not lookup or lookup not in year_map:
            continue
        year = normalize_insurance_year(year_map.get(lookup))
        if year in {"3", "5"}:
            return year
    return normalize_insurance_year(default_year)
def make_product(type_keyword: str, page: str, amount: str | None, **kwargs: Any) -> dict[str, Any]:
item = {
"type": type_keyword,
"page": page,
"amount": amount,
}
item.update(kwargs)
return item
def detect_products(
    line: str,
    config: dict[str, Any],
    insurance_year_choice: str | None,
) -> list[dict[str, Any]]:
    """Detect every product mentioned in one relay line.

    The line is split into clause segments; each segment is scanned for
    insurance, wealth/fund/asset-management keywords and fixed-term deposit
    patterns, pairing each hit with the nearest amount candidate. Duplicate
    (type, page, amount) triples are emitted only once per line.
    """
    insurance_cfg = config.get("insurance_handling", {})
    insurance_page = str(insurance_cfg.get("page", "page_3")) if isinstance(insurance_cfg, dict) else "page_3"
    products: list[dict[str, Any]] = []
    seen: set[str] = set()
    segments = split_segments(line)
    if not segments:
        segments = [line]
    # Amounts extracted from the whole line act as a fallback pool.
    line_candidates = extract_amount_candidates(line)
    seg_search_pos = 0
    def locate_segment_start(seg_text: str) -> int:
        # Offset of the segment within the full line; a moving cursor lets
        # repeated identical segments resolve to successive positions.
        nonlocal seg_search_pos
        if not seg_text:
            return 0
        pos = line.find(seg_text, seg_search_pos)
        if pos < 0:
            pos = line.find(seg_text)
        if pos >= 0:
            seg_search_pos = pos + len(seg_text)
            return pos
        return 0
    def pick_segment_amount(
        seg_text: str,
        seg_candidates: list[dict[str, Any]],
        seg_anchor: int | None,
        seg_start: int,
    ) -> str | None:
        # Prefer an amount inside the segment; fall back to the whole-line
        # pool with the anchor translated to line coordinates.
        amount = pick_amount_from_candidates(seg_text, seg_candidates, seg_anchor)
        if amount:
            return amount
        if not line_candidates:
            return None
        line_anchor = None
        if isinstance(seg_anchor, int) and seg_anchor >= 0:
            line_anchor = seg_start + seg_anchor
        return pick_amount_from_candidates(line, line_candidates, line_anchor)
    for seg in segments:
        seg_candidates = extract_amount_candidates(seg)
        seg_start = locate_segment_start(seg)
        # 1) Insurance (always mapped to 期交 page)
        if "保险" in seg or "期交" in seg or re.search(r"(?:3|5|三|五)\s*年交", seg):
            year = None
            if re.search(r"(?:3|三)\s*年交", seg):
                year = "3"
            elif re.search(r"(?:5|五)\s*年交", seg):
                year = "5"
            elif insurance_year_choice in {"3", "5"}:
                year = insurance_year_choice
            insurance_anchor = seg.find("保险")
            if insurance_anchor < 0:
                insurance_anchor = 0
            amount = pick_segment_amount(seg, seg_candidates, insurance_anchor, seg_start)
            if year == "3":
                type_keyword = "3年交"
            elif year == "5":
                type_keyword = "5年交"
            else:
                type_keyword = "保险"
            key = f"{type_keyword}|{insurance_page}|{amount}"
            if key not in seen:
                products.append(
                    make_product(
                        type_keyword,
                        insurance_page,
                        amount,
                        is_insurance=True,
                        # No explicit/default year: caller must prompt for 3/5.
                        needs_insurance_year=(year is None),
                    )
                )
                seen.add(key)
        # 2) Wealth/Fund/Asset
        if "理财" in seg:
            amount = pick_segment_amount(seg, seg_candidates, seg.find("理财"), seg_start)
            key = f"理财|page_5|{amount}"
            if key not in seen:
                products.append(make_product("理财", "page_5", amount))
                seen.add(key)
        if "基金" in seg:
            amount = pick_segment_amount(seg, seg_candidates, seg.find("基金"), seg_start)
            key = f"基金|page_7|{amount}"
            if key not in seen:
                products.append(make_product("基金", "page_7", amount))
                seen.add(key)
        if "资管" in seg:
            amount = pick_segment_amount(seg, seg_candidates, seg.find("资管"), seg_start)
            key = f"资管|page_6|{amount}"
            if key not in seen:
                products.append(make_product("资管", "page_6", amount))
                seen.add(key)
        # 3) Fixed-term deposit variants
        for term in TERM_DEFS:
            rgx = term["regex"]
            for m in rgx.finditer(seg):
                # Avoid interpreting "2年理财" as one-year term.
                # NOTE(review): the last literal renders empty here — likely a
                # single CJK character lost in transcoding; verify in source.
                if term["page"] == "page_2" and "理财" in seg and "定期" not in seg and "" not in seg:
                    continue
                amount = pick_segment_amount(seg, seg_candidates, m.start(), seg_start)
                key = f"{term['type']}|{term['page']}|{amount}"
                if key not in seen:
                    products.append(make_product(str(term["type"]), str(term["page"]), amount))
                    seen.add(key)
    return products
def is_demand_deposit_only_line(line: str) -> bool:
    """Return True for demand-deposit-style lines that should not generate output.

    A line qualifies when it carries an amount, mentions one of the
    demand-deposit markers (transfers/withdrawals), and names no explicit
    product or tenor keyword (insurance, wealth, fund, 3/6-month, 1-year ...).
    """
    demand_markers = (
        "微信提现",
        "微信支付宝提现",
        "支付宝提现",
        "挖转",
        "他行",
        "揽收现金",
        "存量提升",
        "揽收商户",
        "商户提现",
        "揽收彩礼",
    )
    explicit_product_markers = (
        "保险",
        "年交",
        "理财",
        "基金",
        "资管",
        "趸交",
        "三个月",
        "3个月",
        "六个月",
        "6个月",
        "半年",
        "一年",
        "1年",
    )
    return (
        bool(extract_amount_candidates(line))
        and any(marker in line for marker in demand_markers)
        and not any(marker in line for marker in explicit_product_markers)
    )
def key_for_record(record: dict[str, Any], key_fields: list[str]) -> str:
    """Build the dedup key for a record.

    Records carrying a line serial are keyed by serial + per-line item index
    (plus a content signature when available); others fall back to joining
    the configured key fields.
    """
    serial = str(record.get("line_serial", "")).strip()
    if not serial:
        return "|".join(str(record.get(field, "")).strip() for field in key_fields)
    item_index = str(record.get("line_product_index", "")).strip() or "1"
    signature = str(record.get("line_signature", "")).strip()
    if not signature:
        signature = line_signature(str(record.get("raw_text", "")))
    pieces = [f"line:{serial}", f"item:{item_index}"]
    if signature:
        pieces.append(f"sig:{signature}")
    return "|".join(pieces)
def signature_key_for_record(record: dict[str, Any]) -> str:
    """Build a content-signature dedup key, or "" when no signature exists.

    Falls back from the stored signature to one derived from raw_text, then
    from the normalised source line.
    """
    item_index = str(record.get("line_product_index", "")).strip() or "1"
    signature = str(record.get("line_signature", "")).strip()
    if not signature:
        raw_text = str(record.get("raw_text", "")).strip()
        if raw_text:
            signature = line_signature(raw_text)
        else:
            source_line = str(record.get("source_line", "")).strip()
            if source_line:
                normalized_source = normalize_line(source_line, r"^\s*\d+\s*(?:[、,.\)]\s*)?")
                signature = line_signature(normalized_source)
    return f"sig:{signature}|item:{item_index}" if signature else ""
def render_output_filename(config: dict[str, Any], record: dict[str, Any], index: int) -> str:
    """Render the output PNG filename from the configured pattern.

    Falls back to a default name on any formatting error (the pattern is
    user-configurable) and guarantees a ".png" suffix.
    """
    settings = config.get("output_settings", {})
    pattern = "喜报_{branch}_{index}.png"
    if isinstance(settings, dict):
        pattern = str(settings.get("output_pattern", pattern))
    try:
        name = pattern.format(branch=record.get("branch", "未知网点"), index=index)
    except Exception:
        # Bad user pattern (unknown placeholder, stray braces, ...): fall back.
        name = f"喜报_{record.get('branch', '未知网点')}_{index}.png"
    return name if name.lower().endswith(".png") else f"{name}.png"
def normalize_branch_value(branch: Any, config: dict[str, Any]) -> str:
    """Map a branch name through the configured alias table, stripping whitespace."""
    name = str(branch).strip()
    alias_map = config.get("branches", {}).get("alias", {})
    if isinstance(alias_map, dict) and name in alias_map:
        return str(alias_map.get(name, name)).strip()
    return name
def infer_page_from_type(type_keyword: str, config: dict[str, Any]) -> str:
    """Infer the template page for a product type.

    Tries an exact match in the config "type_matching" map, then the longest
    config key contained in the type, then the built-in term definitions.
    Returns "" when nothing matches.
    """
    keyword = str(type_keyword).strip()
    if not keyword:
        return ""
    type_map = config.get("type_matching", {})
    if isinstance(type_map, dict):
        exact = str(type_map.get(keyword, "")).strip()
        if exact:
            return exact
        # Longest-key-first substring match so e.g. "3年交" beats "年交".
        for map_key, page in sorted(
            ((str(k), str(v)) for k, v in type_map.items()),
            key=lambda pair: len(pair[0]),
            reverse=True,
        ):
            if map_key and map_key in keyword and page:
                return page.strip()
    for term in TERM_DEFS:
        if keyword == str(term.get("type", "")):
            return str(term.get("page", "")).strip()
    return ""
def apply_record_overrides(
    record: dict[str, Any],
    updates: dict[str, Any],
    config: dict[str, Any],
) -> dict[str, Any]:
    """Return a copy of *record* with user-supplied field overrides applied.

    Empty override values are ignored. Changing "type" without an explicit
    "page" re-infers the page; "status" is normalised against the config.
    """
    result = dict(record)
    if not isinstance(updates, dict):
        return result
    if "branch" in updates:
        branch = normalize_branch_value(updates.get("branch", ""), config)
        if branch:
            result["branch"] = branch
    if "amount" in updates:
        amount = normalize_amount_text(updates.get("amount", ""))
        if amount:
            result["amount"] = amount
    if "type" in updates:
        type_value = str(updates.get("type", "")).strip()
        if type_value:
            result["type"] = type_value
            if "page" not in updates:
                inferred = infer_page_from_type(type_value, config)
                if inferred:
                    result["page"] = inferred
    if "page" in updates:
        page_value = str(updates.get("page", "")).strip()
        if page_value:
            result["page"] = page_value
    if "status" in updates:
        status_value = str(updates.get("status", "")).strip()
        if status_value:
            result["status"] = normalize_status_value(status_value, config)
    return result
def validate_record_for_generation(record: dict[str, Any], config: dict[str, Any]) -> None:
    """Validate (and normalise in place) a record before report generation.

    Ensures branch/amount/type/page/status are present and allowed by the
    config, inferring the page from the type and defaulting the status when
    missing. Mutates the record's "amount", "page" and "status" fields.

    Raises:
        ValueError: on any missing or disallowed field.
    """
    branch = str(record.get("branch", "")).strip()
    amount = normalize_amount_text(record.get("amount", ""))
    type_keyword = str(record.get("type", "")).strip()
    page = str(record.get("page", "")).strip()
    status = str(record.get("status", "")).strip()
    if not branch:
        raise ValueError("branch is required")
    allowed = config.get("branches", {}).get("allowed", [])
    if isinstance(allowed, list) and allowed:
        allow_set = {str(x).strip() for x in allowed}
        if branch not in allow_set:
            raise ValueError(f"branch not allowed: {branch}")
    if not amount:
        raise ValueError("amount is required")
    record["amount"] = amount
    if not type_keyword:
        raise ValueError("type is required")
    if not page:
        # Page omitted: derive it from the product type.
        page = infer_page_from_type(type_keyword, config)
        if page:
            record["page"] = page
    page = str(record.get("page", "")).strip()
    if not page:
        raise ValueError("page is required")
    pages = config.get("pages", {})
    if isinstance(pages, dict) and page not in pages:
        raise ValueError(f"invalid page: {page}")
    if not status:
        # Empty status falls back to the configured default.
        status = str(config.get("status_extraction", {}).get("fallback", "成功营销")).strip() or "成功营销"
    valid_status = get_valid_status_list(config)
    if valid_status and status not in valid_status:
        raise ValueError(f"invalid status: {status}")
    record["status"] = normalize_status_value(status, config)
def match_manual_rule(rule: dict[str, Any], source_line: str, normalized_line: str) -> bool:
    """Check whether a manual rule's keyword matches the given line.

    ``match_mode`` selects which text is searched: "source", "both", or
    (default) the normalised line. Disabled, malformed or keyword-less rules
    never match.
    """
    if not isinstance(rule, dict) or not bool(rule.get("enabled", True)):
        return False
    keyword = str(rule.get("keyword", "")).strip()
    if not keyword:
        return False
    mode = str(rule.get("match_mode", "normalized")).strip().lower()
    haystacks = {
        "source": (source_line,),
        "both": (source_line, normalized_line),
    }.get(mode, (normalized_line,))
    return any(keyword in text for text in haystacks)
def apply_manual_rules_to_record(
    record: dict[str, Any],
    *,
    source_line: str,
    normalized_line: str,
    rules: list[dict[str, Any]],
    config: dict[str, Any],
) -> tuple[dict[str, Any], list[str]]:
    """Apply every matching manual rule to a copy of *record*.

    Returns the (possibly updated) record and the ids of the rules that were
    applied. Rules with malformed or failing updates are skipped silently so
    a broken rule never aborts parsing.
    """
    if not rules:
        return record, []
    updated = dict(record)
    applied_ids: list[str] = []
    for rule in rules:
        if not match_manual_rule(rule, source_line, normalized_line):
            continue
        updates = rule.get("updates", {})
        if not isinstance(updates, dict):
            continue
        try:
            updated = apply_record_overrides(updated, updates, config)
        except Exception:
            # Best-effort application: skip the offending rule.
            continue
        applied_ids.append(str(rule.get("id", "")))
    return updated, [rule_id for rule_id in applied_ids if rule_id]
def save_or_update_manual_rule(
    *,
    keyword: str,
    updates: dict[str, Any],
    note: str = "",
    match_mode: str = "normalized",
) -> dict[str, Any]:
    """Create a manual rule, or refresh the timestamp of an identical one.

    The keyword has all whitespace removed; updates are reduced to the known
    fields and must be non-empty. Persists the rule list and returns a copy
    of the stored rule.

    Raises:
        ValueError: when the keyword or any usable updates are missing.
    """
    compact_keyword = re.sub(r"\s+", "", str(keyword).strip())
    if not compact_keyword:
        raise ValueError("rule keyword is required")
    if not isinstance(updates, dict) or not updates:
        raise ValueError("rule updates is required")
    cleaned_updates: dict[str, Any] = {}
    for field in ("branch", "type", "page", "status", "amount"):
        if field in updates:
            value = str(updates.get(field, "")).strip()
            if value:
                cleaned_updates[field] = value
    if not cleaned_updates:
        raise ValueError("rule updates is empty")
    mode = str(match_mode or "normalized").strip().lower()
    if mode not in {"source", "normalized", "both"}:
        mode = "normalized"
    rules = load_manual_rules()
    now = datetime.now().isoformat(timespec="seconds")
    for rule in rules:
        if not isinstance(rule, dict):
            continue
        same_keyword = str(rule.get("keyword", "")).strip() == compact_keyword
        same_mode = str(rule.get("match_mode", "normalized")).strip().lower() == mode
        if not (same_keyword and same_mode and rule.get("updates") == cleaned_updates):
            continue
        # Identical rule already stored: only bump its timestamp (and note).
        rule["updated_at"] = now
        if note:
            rule["note"] = note
        save_manual_rules(rules)
        return dict(rule)
    new_rule = {
        "id": uuid.uuid4().hex[:12],
        "keyword": compact_keyword,
        "match_mode": mode,
        "updates": cleaned_updates,
        "enabled": True,
        "created_at": now,
        "updated_at": now,
    }
    if note:
        new_rule["note"] = note
    rules.append(new_rule)
    save_manual_rules(rules)
    return dict(new_rule)
def infer_correction_rule_keyword(
    *,
    source_line: str,
    normalized_line: str,
    corrected_record: dict[str, Any],
) -> str:
    """Pick a keyword for a correction rule derived from a fixed record.

    Prefers the corrected status, then a short corrected type, when either
    appears in the line; otherwise truncates the line itself.
    """
    status = str(corrected_record.get("status", "")).strip()
    if status and (status in normalized_line or status in source_line):
        return status
    type_value = str(corrected_record.get("type", "")).strip()
    if type_value and len(type_value) <= 8 and (type_value in normalized_line or type_value in source_line):
        return type_value
    if normalized_line:
        return normalized_line[:24]
    return re.sub(r"\s+", "", source_line)[:24]
def parse_records(
    raw_text: str,
    config: dict[str, Any],
    history: list[dict[str, Any]],
    insurance_year_choice: str | None = None,
    insurance_year_choices: dict[str, str] | None = None,
) -> dict[str, Any]:
    """Parse pasted relay text into generation-ready records.

    Pipeline: split/normalise lines, drop skip-rule and date-header lines,
    extract branch/status/products per line, apply manual correction rules,
    dedup against history and within the batch, and queue insurance items
    that still need a 3/5-year choice. Returns a dict with all parsed
    records, new records, pending insurance items, duplicates, visible
    skipped lines and a summary.
    """
    relay_cfg = config.get("relay_handling", {})
    parse_rules = relay_cfg.get("parse_rules", {}) if isinstance(relay_cfg, dict) else {}
    trigger = str(relay_cfg.get("trigger_keyword", "#接龙"))
    line_pattern = str(parse_rules.get("line_pattern", r"^\d+、\s*"))
    skip_lines = parse_rules.get("skip_lines", [])
    skip_lines = [str(x) for x in skip_lines] if isinstance(skip_lines, list) else []
    has_trigger = trigger in raw_text
    lines = [line.strip() for line in raw_text.splitlines() if line.strip()]
    # Stage 1: keep only lines worth parsing, recording skip reasons.
    filtered_lines: list[dict[str, str]] = []
    skipped: list[dict[str, str]] = []
    for line in lines:
        source_line = line.strip()
        # A line that is exactly the trigger keyword is dropped silently.
        if trigger and trigger in source_line and source_line == trigger:
            continue
        normalized = normalize_line(source_line, line_pattern)
        if not normalized:
            continue
        if is_date_header_line(normalized):
            skipped.append({"line": source_line, "reason": "skip_line_rule"})
            continue
        if any(token and token in normalized for token in skip_lines):
            skipped.append({"line": source_line, "reason": "skip_line_rule"})
            continue
        filtered_lines.append(
            {
                "source_line": source_line,
                "normalized_line": normalized,
                "line_serial": extract_line_serial(source_line, line_pattern),
            }
        )
    # Nothing matched line-by-line: treat the whole text as a single entry.
    if not filtered_lines and raw_text.strip():
        source_line = raw_text.strip()
        normalized_single = normalize_line(source_line, line_pattern)
        if normalized_single:
            filtered_lines = [
                {
                    "source_line": source_line,
                    "normalized_line": normalized_single,
                    "line_serial": extract_line_serial(source_line, line_pattern),
                }
            ]
    branches_cfg = config.get("branches", {})
    allowed_branches = set(str(x) for x in branches_cfg.get("allowed", [])) if isinstance(branches_cfg, dict) else set()
    dedup_cfg = relay_cfg.get("dedup", {}) if isinstance(relay_cfg, dict) else {}
    key_fields = dedup_cfg.get("key_fields", ["branch", "amount", "type"])
    if not isinstance(key_fields, list) or not key_fields:
        key_fields = ["branch", "amount", "type"]
    key_fields = [str(k) for k in key_fields]
    insurance_year_map = dict(insurance_year_choices) if isinstance(insurance_year_choices, dict) else {}
    # Dedup state: history lookups by key fields and by content signature,
    # plus per-batch key sets for intra-input duplicates.
    history_lookup = build_history_lookup(history, key_fields)
    history_keys = set(history_lookup.keys())
    history_signature_lookup = build_history_signature_lookup(history)
    history_signature_keys = set(history_signature_lookup.keys())
    batch_keys: set[str] = set()
    batch_signature_keys: set[str] = set()
    suppressed_lookup = build_skip_suppression_lookup(list_skip_suppressions())
    manual_rules = load_manual_rules()
    records: list[dict[str, Any]] = []
    new_records: list[dict[str, Any]] = []
    pending_insurance_records: list[dict[str, Any]] = []
    duplicate_records: list[dict[str, Any]] = []
    # Stage 2: extract one record per detected product per line.
    for line_item in filtered_lines:
        source_line = str(line_item.get("source_line", ""))
        normalized_line = str(line_item.get("normalized_line", ""))
        line_serial = str(line_item.get("line_serial", "")).strip()
        normalized_signature = line_signature(normalized_line)
        branch_raw, branch = extract_branch(normalized_line, config)
        if not branch:
            skipped.append({"line": source_line, "reason": "branch_not_found"})
            continue
        if allowed_branches and branch not in allowed_branches:
            skipped.append({"line": source_line, "reason": f"branch_not_allowed:{branch}"})
            continue
        status = normalize_status_value(extract_status(normalized_line, config), config)
        products = detect_products(normalized_line, config, insurance_year_choice)
        if not products:
            if is_demand_deposit_only_line(normalized_line):
                skipped.append({"line": source_line, "reason": "demand_deposit_not_generate"})
            else:
                skipped.append({"line": source_line, "reason": "type_not_found"})
            continue
        for product_index, product in enumerate(products, start=1):
            amount = str(product.get("amount", "")).strip()
            if not amount:
                skipped.append({"line": source_line, "reason": "amount_not_found"})
                continue
            type_keyword = str(product.get("type", "")).strip()
            page = str(product.get("page", "")).strip()
            if not type_keyword or not page:
                skipped.append({"line": source_line, "reason": "type_not_found"})
                continue
            record = {
                "source_line": source_line,
                "raw_text": normalized_line,
                "branch_raw": branch_raw,
                "branch": branch,
                "amount": amount,
                "type": type_keyword,
                "page": page,
                "status": status,
                "needs_insurance_year": bool(product.get("needs_insurance_year", False)),
            }
            if line_serial:
                record["line_serial"] = line_serial
            if normalized_signature:
                record["line_signature"] = normalized_signature
            record["line_product_index"] = str(product_index)
            # Resolve a previously answered 3/5-year choice for this item.
            if record["needs_insurance_year"]:
                choice_key = build_insurance_choice_key(
                    source_line=source_line,
                    raw_text=normalized_line,
                    branch=branch,
                    amount=amount,
                    page=page,
                    item_index=product_index,
                )
                record["insurance_choice_key"] = choice_key
                selected_year = pick_insurance_year_for_record(
                    choice_key=choice_key,
                    source_line=source_line,
                    raw_text=normalized_line,
                    branch=branch,
                    amount=amount,
                    year_map=insurance_year_map,
                    default_year=insurance_year_choice,
                )
                if selected_year in {"3", "5"}:
                    record["type"] = f"{selected_year}年交"
                    record["needs_insurance_year"] = False
                    record["insurance_year"] = selected_year
            # Stage 3: manual correction rules, then re-normalise fields.
            record, hit_rule_ids = apply_manual_rules_to_record(
                record,
                source_line=source_line,
                normalized_line=normalized_line,
                rules=manual_rules,
                config=config,
            )
            record["amount"] = normalize_amount_text(record.get("amount", ""))
            record["status"] = normalize_status_value(str(record.get("status", "")), config)
            if hit_rule_ids:
                record["applied_rule_ids"] = hit_rule_ids
            # A manual rule may have forced the type to a concrete 3/5-year form.
            if record["needs_insurance_year"]:
                forced_year = normalize_insurance_year(record.get("type"))
                if forced_year in {"3", "5"}:
                    record["type"] = f"{forced_year}年交"
                    record["needs_insurance_year"] = False
                    record["insurance_year"] = forced_year
            # Re-validate after rule application (rules may blank fields).
            if not str(record.get("amount", "")).strip():
                skipped.append({"line": source_line, "reason": "amount_not_found"})
                continue
            if not str(record.get("page", "")).strip():
                inferred_page = infer_page_from_type(str(record.get("type", "")), config)
                if inferred_page:
                    record["page"] = inferred_page
            pages = config.get("pages", {})
            if isinstance(pages, dict) and str(record.get("page", "")).strip() not in pages:
                skipped.append({"line": source_line, "reason": "type_not_found"})
                continue
            if allowed_branches and str(record.get("branch", "")).strip() not in allowed_branches:
                skipped.append({"line": source_line, "reason": f"branch_not_allowed:{record.get('branch', '')}"})
                continue
            records.append(record)
            # Stage 4: dedup against history, then within this batch.
            dedup_key = key_for_record(record, key_fields)
            record["dedup_key"] = dedup_key
            signature_key = signature_key_for_record(record)
            if signature_key:
                record["signature_key"] = signature_key
            history_record = history_lookup.get(dedup_key)
            if history_record is None and signature_key:
                history_record = history_signature_lookup.get(signature_key)
            if history_record is not None or dedup_key in history_keys or (signature_key and signature_key in history_signature_keys):
                dup = dict(record)
                dup["duplicate_reason"] = "history_duplicate"
                attach_duplicate_download_info(dup, history_record, config)
                duplicate_records.append(dup)
                continue
            if dedup_key in batch_keys or (signature_key and signature_key in batch_signature_keys):
                dup = dict(record)
                dup["duplicate_reason"] = "input_duplicate"
                duplicate_records.append(dup)
                continue
            # Insurance without explicit 3/5 year should ask user only when truly pending.
            # If this line was manually suppressed in skip rules, keep it out of prompt queue.
            if record["needs_insurance_year"]:
                if is_skip_item_suppressed(source_line, "insurance_year_pending", suppressed_lookup):
                    batch_keys.add(dedup_key)
                    if signature_key:
                        batch_signature_keys.add(signature_key)
                    continue
                pending = dict(record)
                pending["insurance_options"] = ["3", "5"]
                pending_insurance_records.append(pending)
                batch_keys.add(dedup_key)
                if signature_key:
                    batch_signature_keys.add(signature_key)
                continue
            batch_keys.add(dedup_key)
            if signature_key:
                batch_signature_keys.add(signature_key)
            new_records.append(record)
    # Stage 5: assign output filenames and hide suppressed skip entries.
    for i, rec in enumerate(new_records, start=1):
        rec["output_file"] = render_output_filename(config, rec, i)
    visible_skipped: list[dict[str, str]] = []
    for item in skipped:
        line_text = str(item.get("line", ""))
        reason_text = str(item.get("reason", ""))
        if is_skip_item_suppressed(line_text, reason_text, suppressed_lookup):
            continue
        visible_skipped.append(item)
    summary = {
        "input_lines": len(lines),
        "parsed": len(records),
        "new": len(new_records),
        "duplicate": len(duplicate_records),
        "skipped": len(visible_skipped),
        "insurance_pending": len(pending_insurance_records),
    }
    return {
        "has_trigger": has_trigger,
        "records": records,
        "new_records": new_records,
        "pending_insurance_records": pending_insurance_records,
        "duplicate_records": duplicate_records,
        "skipped": visible_skipped,
        "summary": summary,
        "dedup_key_fields": key_fields,
        "needs_insurance_choice": len(pending_insurance_records) > 0,
    }
def append_new_history(
    history_path: Path,
    current_history: list[dict[str, Any]],
    records: list[dict[str, Any]],
    key_fields: list[str],
) -> dict[str, Any]:
    """Append records whose dedup key is not yet in history, then persist.

    Download-token fields are stripped and a creation timestamp is stamped
    onto each stored record. Returns counts of added and total records.
    """
    seen_keys = {key_for_record(item, key_fields) for item in current_history}
    now = datetime.now().isoformat(timespec="seconds")
    additions: list[dict[str, Any]] = []
    for rec in records:
        if not isinstance(rec, dict):
            continue
        dedup_key = key_for_record(rec, key_fields)
        if not dedup_key or dedup_key in seen_keys:
            continue
        stored = dict(rec)
        stored.pop("download_token", None)
        stored.pop("download_url", None)
        stored["created_at"] = now
        additions.append(stored)
        seen_keys.add(dedup_key)
    merged = current_history + additions
    save_history(history_path, merged)
    return {"added": len(additions), "total": len(merged)}
def upsert_history_records(
    history_path: Path,
    current_history: list[dict[str, Any]],
    records: list[dict[str, Any]],
    key_fields: list[str],
) -> dict[str, Any]:
    """Insert or replace history records by dedup key, then persist.

    Existing entries with the same key are overwritten in place; new keys
    are appended. Download-token fields are stripped and a creation
    timestamp stamped. Returns counts of added, replaced and total records.
    """
    merged = [item for item in current_history if isinstance(item, dict)]
    position_by_key: dict[str, int] = {}
    for position, item in enumerate(merged):
        dedup_key = key_for_record(item, key_fields)
        if dedup_key:
            position_by_key[dedup_key] = position
    now = datetime.now().isoformat(timespec="seconds")
    added = 0
    replaced = 0
    for rec in records:
        if not isinstance(rec, dict):
            continue
        stored = dict(rec)
        stored.pop("download_token", None)
        stored.pop("download_url", None)
        stored["created_at"] = now
        dedup_key = key_for_record(stored, key_fields)
        if not dedup_key:
            continue
        if dedup_key in position_by_key:
            merged[position_by_key[dedup_key]] = stored
            replaced += 1
        else:
            position_by_key[dedup_key] = len(merged)
            merged.append(stored)
            added += 1
    save_history(history_path, merged)
    return {"added": added, "replaced": replaced, "total": len(merged)}
def page_to_slide_index(page_name: str) -> int:
    """Convert a "page_N" name into its numeric slide index.

    Raises:
        ValueError: when the name does not match "page_<digits>".
    """
    match = re.match(r"^page_(\d+)$", page_name)
    if match is None:
        raise ValueError(f"invalid page name: {page_name}")
    return int(match.group(1))
def safe_filename(name: str) -> str:
    """Sanitise a filename: unsafe characters and whitespace become "_".

    An all-empty result is replaced with a random "file_xxxxxxxx" name.
    """
    cleaned = re.sub(r"[\\/:*?\"<>|\r\n]+", "_", name).strip()
    cleaned = re.sub(r"\s+", "_", cleaned)
    if cleaned:
        return cleaned
    return f"file_{uuid.uuid4().hex[:8]}"
def amount_to_digits(amount: str) -> str:
    """Render an amount as floored integer digits for slide substitution.

    Falls back to stripping unit text from the raw string when the value
    cannot be parsed as a number.

    NOTE(review): the second .replace() argument renders as an empty string
    in this view — presumably a single CJK unit character lost in
    transcoding; confirm against the original file.
    """
    v = floor_amount_number(amount)
    if v is None:
        t = str(amount).strip()
        return t.replace("万元", "").replace("", "")
    return str(v)
def pick_paragraph_index(paragraph_count: int, configured_index: int) -> int:
    """Clamp a configured paragraph index into range, accepting 1-based values.

    Returns 0 when the count is non-positive or the index (interpreted first
    as 0-based, then as 1-based) falls outside the paragraph list.
    """
    if paragraph_count <= 0:
        return 0
    if 0 <= configured_index < paragraph_count:
        return configured_index
    # Tolerate 1-based indices coming from config files.
    shifted = configured_index - 1
    if 0 <= shifted < paragraph_count:
        return shifted
    return 0
def paragraph_plain_text(para: Any) -> str:
    """Concatenate a paragraph's run texts, falling back to its .text attribute."""
    runs = getattr(para, "runs", None)
    if runs:
        return "".join(run.text or "" for run in runs)
    return str(getattr(para, "text", "") or "")
def set_paragraph_text(para: Any, text: str) -> None:
    """Write *text* into a paragraph, reusing the first run's formatting.

    When runs exist, the text goes into run 0 and the remaining runs are
    blanked; otherwise the paragraph's text attribute is set directly.
    """
    runs = getattr(para, "runs", None)
    if not runs:
        para.text = text
        return
    runs[0].text = text
    for extra_run in runs[1:]:
        extra_run.text = ""
def paragraph_has_branch_marker(para: Any) -> bool:
    """True when the paragraph's text is non-empty and mentions "营业所"."""
    content = paragraph_plain_text(para).strip()
    return "营业所" in content if content else False
def replace_company_placeholder_text(text: str, company_replacement: str) -> str:
    """Replace an X/*-style placeholder directly before "分公司" with a real name.

    With an empty replacement the text passes through untouched (after None
    coercion). Only the first placeholder occurrence is replaced.
    """
    replacement = str(company_replacement or "").strip()
    source = str(text or "")
    if not replacement:
        return source
    return re.sub(r"([Xx*]+)\s*(?=分公司)", replacement, source, count=1)
def build_branch_text(
    original_text: str,
    branch: str,
    fallback_text: str,
    company_replacement: str = "",
) -> str:
    """Rewrite the branch portion of a template title line.

    Keeps the template prefix (e.g. the company part), swaps only the text
    immediately before "营业所" for the new branch name, and returns the
    provided fallback when the original text is blank or has no branch slot.
    """
    text = str(original_text or "")
    if not text.strip():
        return fallback_text
    # Some templates use placeholder tokens like "XX/**" before "分公司";
    # resolve those first so "XX分公司" never leaks into the output.
    text = replace_company_placeholder_text(text, company_replacement)
    # Prefer the branch token following "分公司"; otherwise take the first one.
    for pattern, branch_group in (
        (r"(分公司)([^|,。,;\s]*)营业所", 2),
        (r"([^|,。,;\s]*)营业所", 1),
    ):
        found = re.search(pattern, text)
        if found:
            return f"{text[:found.start(branch_group)]}{branch}营业所{text[found.end():]}"
    return fallback_text
def replace_branch_in_slide(slide: Any, branch: str, config: dict[str, Any]) -> None:
    """Replace the branch name ("...营业所") inside a slide's text shapes.

    Looks first at the configured shape/paragraph location, then scans every
    text frame for a paragraph mentioning "营业所". Also resolves
    "XX分公司"-style company placeholders in sibling paragraphs when a
    replacement is configured. Falls back to overwriting the configured
    paragraph when no marker is found anywhere.
    """
    branch_cfg = config.get("replace_algorithm", {}).get("branch", {})
    location_cfg = config.get("branches", {}).get("template_location", {})
    shape_index = int(branch_cfg.get("shape_index", location_cfg.get("shape_index", 3)))
    paragraph_index = int(branch_cfg.get("paragraph_index", location_cfg.get("paragraph_index", 1)))
    fmt = str(branch_cfg.get("format", location_cfg.get("display_format", "{branch}营业所")))
    company_replacement = str(
        branch_cfg.get("company_replacement", location_cfg.get("company_replacement", ""))
    ).strip()
    new_text = fmt.format(branch=branch)
    target_para = None
    fallback_para = None
    target_shape = None
    cfg_paragraphs = []
    # 1) Try the configured shape/paragraph first.
    if 0 <= shape_index < len(slide.shapes):
        cfg_shape = slide.shapes[shape_index]
        if getattr(cfg_shape, "has_text_frame", False):
            cfg_paragraphs = cfg_shape.text_frame.paragraphs
    if cfg_paragraphs:
        p_index = pick_paragraph_index(len(cfg_paragraphs), paragraph_index)
        fallback_para = cfg_paragraphs[p_index]
        if paragraph_has_branch_marker(cfg_paragraphs[p_index]):
            target_para = cfg_paragraphs[p_index]
            target_shape = cfg_shape
        else:
            # Configured paragraph lacks the marker: search its siblings.
            for para in cfg_paragraphs:
                if paragraph_has_branch_marker(para):
                    target_para = para
                    target_shape = cfg_shape
                    break
    # 2) Otherwise scan every shape for a paragraph containing "营业所".
    if target_para is None:
        for shape in slide.shapes:
            if not getattr(shape, "has_text_frame", False):
                continue
            paragraphs = shape.text_frame.paragraphs
            for para in paragraphs:
                if paragraph_has_branch_marker(para):
                    target_para = para
                    target_shape = shape
                    break
            if target_para is not None:
                break
    if target_para is not None:
        original_text = paragraph_plain_text(target_para)
        set_paragraph_text(
            target_para,
            build_branch_text(original_text, branch, new_text, company_replacement),
        )
        # Also resolve company placeholders in the target shape's other paragraphs.
        if company_replacement and target_shape is not None and getattr(target_shape, "has_text_frame", False):
            for para in target_shape.text_frame.paragraphs:
                old_text = paragraph_plain_text(para)
                new_company_text = replace_company_placeholder_text(old_text, company_replacement)
                if new_company_text != old_text:
                    set_paragraph_text(para, new_company_text)
        return
    # 3) No marker anywhere: overwrite the configured fallback paragraph.
    if fallback_para is not None:
        set_paragraph_text(fallback_para, new_text)
def replace_amount_in_slide(slide: Any, shape_index: int, amount: str) -> None:
    """Replace the numeric amount token(s) inside one slide shape.

    Collects runs that are purely numeric or all-asterisk placeholders, then
    prefers the token adjacent to a "万/万元" unit run; when none qualifies,
    the last numeric run is replaced. No-op for missing/non-text shapes.

    NOTE(review): one membership test below renders as an empty-string
    literal — presumably the single character "万" lost in transcoding;
    confirm against the original file.
    """
    if shape_index < 0 or shape_index >= len(slide.shapes):
        return
    shape = slide.shapes[shape_index]
    if not getattr(shape, "has_text_frame", False):
        return
    amount_digits = amount_to_digits(amount)
    candidates: list[tuple[Any, int, str]] = []
    paragraphs = shape.text_frame.paragraphs
    # Collect numeric or "***" placeholder runs as replacement candidates.
    for para in paragraphs:
        runs = list(getattr(para, "runs", []) or [])
        for idx, run in enumerate(runs):
            t = str(getattr(run, "text", "") or "").strip()
            if re.fullmatch(r"\d+(?:\.\d+)?", t) or re.fullmatch(r"[\*]+", t):
                candidates.append((para, idx, t))
    if not candidates:
        return
    def next_nonempty_text(para: Any, start_idx: int) -> str:
        # First non-empty run text after start_idx in this paragraph.
        runs = list(getattr(para, "runs", []) or [])
        for j in range(start_idx + 1, len(runs)):
            txt = str(getattr(runs[j], "text", "") or "").strip()
            if txt:
                return txt
        return ""
    def prev_nonempty_text(para: Any, start_idx: int) -> str:
        # First non-empty run text before start_idx in this paragraph.
        runs = list(getattr(para, "runs", []) or [])
        for j in range(start_idx - 1, -1, -1):
            txt = str(getattr(runs[j], "text", "") or "").strip()
            if txt:
                return txt
        return ""
    preferred: list[tuple[Any, int, str]] = []
    for para, idx, _ in candidates:
        nxt = next_nonempty_text(para, idx)
        prv = prev_nonempty_text(para, idx)
        # Prefer the amount token that is adjacent to "万/万元".
        if ("" in nxt) or ("万元" in nxt) or ("" in prv) or ("万元" in prv):
            preferred.append((para, idx, ""))
    targets = preferred if preferred else [candidates[-1]]
    for para, idx, _ in targets:
        runs = list(getattr(para, "runs", []) or [])
        if 0 <= idx < len(runs):
            runs[idx].text = amount_digits
def replace_status_in_slide(
    slide: Any,
    shape_index: int,
    status: str,
    page_name: str,
    config: dict[str, Any],
) -> None:
    """Replace the status text inside the configured slide shape.

    Finds the paragraph that currently shows a status (or the first
    non-empty one) and swaps the status run in place, preserving the
    template's run formatting wherever possible. Pages flagged with
    "has_bracket" get the bracketed variant and centre alignment.

    NOTE(review): several literals below render as empty strings and one
    regex character class appears truncated — presumably CJK bracket
    characters lost in transcoding; confirm against the original file.
    """
    if shape_index < 0 or shape_index >= len(slide.shapes):
        return
    shape = slide.shapes[shape_index]
    if not getattr(shape, "has_text_frame", False):
        return
    normalized_status = normalize_status_value(status, config)
    page_cfg = config.get("pages", {}).get(page_name, {})
    has_bracket = bool(page_cfg.get("has_bracket", False)) if isinstance(page_cfg, dict) else False
    target_text = f"{normalized_status}" if has_bracket else normalized_status
    valid_status = get_valid_status_list(config)
    paragraphs = shape.text_frame.paragraphs
    if not paragraphs:
        shape.text_frame.text = target_text
        return
    # Locate the paragraph that currently carries a status value.
    target_para = None
    for para in paragraphs:
        para_text = "".join(run.text for run in para.runs).strip() if para.runs else para.text.strip()
        if not para_text:
            continue
        if any(s in para_text for s in valid_status) or ("成功营销" in para_text) or ("" in para_text and "" in para_text):
            target_para = para
            break
    if target_para is None:
        # Fall back to the first non-empty paragraph, then to paragraph 0.
        for para in paragraphs:
            para_text = "".join(run.text for run in para.runs).strip() if para.runs else para.text.strip()
            if para_text:
                target_para = para
                break
    if target_para is None:
        target_para = paragraphs[0]
    if target_para.runs:
        replaced = False
        # Prefer replacing an existing status run so the template font/size
        # stay completely unchanged.
        for run in target_para.runs:
            raw = run.text or ""
            trimmed = raw.strip()
            if not raw:
                continue
            if trimmed in valid_status or trimmed == "成功营销":
                run.text = raw.replace(trimmed, normalized_status, 1)
                replaced = True
                break
        if not replaced and has_bracket:
            # Second chance: rewrite the bracketed span inside a run.
            for run in target_para.runs:
                raw = run.text or ""
                if "" in raw and "" in raw:
                    run.text = re.sub(r"[^]*", f"{normalized_status}", raw, count=1)
                    replaced = True
                    break
        if not replaced:
            # Last resort: overwrite run 0 and blank the rest.
            target_para.runs[0].text = target_text
            for run in target_para.runs[1:]:
                run.text = ""
    else:
        target_para.text = target_text
    if has_bracket:
        target_para.alignment = PP_ALIGN.CENTER
        shape.text_frame.word_wrap = False
def replace_page_display_name_in_slide(slide: Any, page_name: str, config: dict[str, Any]) -> None:
    """Replace the page's display name (or any configured alias) on the slide.

    Reads ``display_name`` / ``display_aliases`` from the page's config entry
    and rewrites the first matching paragraph per shape. No-op when the page
    config is malformed or the display name is empty.
    """
    pages = config.get("pages", {})
    if not isinstance(pages, dict):
        return
    page_cfg = pages.get(page_name, {})
    if not isinstance(page_cfg, dict):
        return
    display_name = str(page_cfg.get("display_name", "")).strip()
    if not display_name:
        return
    aliases = page_cfg.get("display_aliases", [])
    alias_list: list[str] = [display_name]
    if isinstance(aliases, list):
        alias_list.extend([str(x).strip() for x in aliases if str(x).strip()])
    if page_name == "page_5":
        # Backward compatibility with older template wording.
        alias_list.extend(["两三期理财", "两三年期理财"])
    # De-duplicate while preserving order.
    alias_list = [x for x in dict.fromkeys(alias_list) if x]
    for shape in slide.shapes:
        if not getattr(shape, "has_text_frame", False):
            continue
        paragraphs = shape.text_frame.paragraphs
        for para in paragraphs:
            raw_text = paragraph_plain_text(para)
            compact_text = normalize_skip_line(raw_text)
            if not compact_text:
                continue
            for alias in alias_list:
                alias_compact = normalize_skip_line(alias)
                if not alias_compact or alias_compact not in compact_text:
                    continue
                if compact_text == alias_compact:
                    # Whole paragraph is the alias: replace it outright.
                    set_paragraph_text(para, display_name)
                elif alias in raw_text:
                    # Alias appears verbatim: substitute within the raw text.
                    set_paragraph_text(para, raw_text.replace(alias, display_name, 1))
                elif len(compact_text) <= 24:
                    # Short paragraph: substitute within the compacted form.
                    set_paragraph_text(para, compact_text.replace(alias_compact, display_name, 1))
                else:
                    continue
                # Avoid auto-wrapping when the new name is one character longer.
                shape.text_frame.word_wrap = False
                break
def keep_only_target_slide(prs: Presentation, keep_index: int) -> bool:
    """Drop every slide except the one at *keep_index*; return True on success."""
    try:
        slide_ids = prs.slides._sldIdLst  # type: ignore[attr-defined]
        count = len(prs.slides)
        if not (0 <= keep_index < count):
            return False
        # Walk backwards so positional deletion does not shift pending indices.
        for pos in reversed(range(count)):
            if pos == keep_index:
                continue
            prs.part.drop_rel(slide_ids[pos].rId)
            del slide_ids[pos]
        return True
    except Exception:
        # Best-effort: callers fall back to multi-slide output on failure.
        return False
def is_single_slide_output_enabled(config: dict[str, Any]) -> bool:
    """Whether single-slide PPT output is enabled (defaults to True)."""
    perf = config.get("performance", {})
    if not isinstance(perf, dict):
        return True
    return bool(perf.get("single_slide_output", True))
def resolve_generation_strategy(config: dict[str, Any], total_records: int) -> str:
    """Pick 'legacy' or 'page_template_cache' from config and the record count."""
    perf = config.get("performance", {})
    if not isinstance(perf, dict):
        return "legacy"
    requested = str(perf.get("generation_strategy", "page_template_cache")).strip().lower()
    if requested not in {"legacy", "page_template_cache"}:
        # Unknown values fall back to the cached strategy.
        requested = "page_template_cache"
    if requested == "legacy":
        return "legacy"
    try:
        threshold = int(perf.get("template_cache_min_records", 2))
    except Exception:
        threshold = 2
    # Small batches do not amortize the cache-building cost.
    if total_records < max(1, threshold):
        return "legacy"
    return "page_template_cache" if is_single_slide_output_enabled(config) else "legacy"
def _resolve_perf_section(config: dict[str, Any]) -> dict[str, Any]:
perf_cfg = config.get("performance", {})
return perf_cfg if isinstance(perf_cfg, dict) else {}
def resolve_single_generation_mode(config: dict[str, Any]) -> bool:
    """Whether generation runs one job at a time (defaults to True)."""
    return bool(_resolve_perf_section(config).get("single_generation_mode", True))
def resolve_build_workers(config: dict[str, Any], total_records: int) -> int:
    """Clamp configured PPT build workers to [1, 6], the record count and the CPU count."""
    perf = _resolve_perf_section(config)
    try:
        requested = int(perf.get("max_build_workers", 1))
    except Exception:
        requested = 1
    requested = min(6, max(1, requested))
    cpu_cap = os.cpu_count() or 2
    return max(1, min(requested, total_records, cpu_cap))
def resolve_extract_workers(config: dict[str, Any]) -> int:
    """Clamp configured PDF->PNG extract workers to [1, 6] and the CPU count."""
    perf = _resolve_perf_section(config)
    try:
        requested = int(perf.get("max_extract_workers", 1))
    except Exception:
        requested = 1
    return max(1, min(6, requested, os.cpu_count() or 2))
def resolve_memory_reclaim_options(config: dict[str, Any]) -> dict[str, bool]:
    """Resolve performance.memory_reclaim flags; sub-flags are forced off when disabled."""
    raw = _resolve_perf_section(config).get("memory_reclaim", {})
    if not isinstance(raw, dict):
        raw = {}
    enabled = bool(raw.get("enabled", True))
    return {
        "enabled": enabled,
        "gc_collect": enabled and bool(raw.get("gc_collect", True)),
        "malloc_trim": enabled and bool(raw.get("malloc_trim", True)),
    }
def _try_malloc_trim() -> bool:
    """Best-effort glibc ``malloc_trim(0)``; returns False on any failure.

    Caches the resolved libc function in the module-level ``_MALLOC_TRIM_FN``
    so the CDLL lookup happens at most once per process.
    """
    global _MALLOC_TRIM_FN
    if os.name != "posix":
        return False
    try:
        if _MALLOC_TRIM_FN is None:
            trim = ctypes.CDLL("libc.so.6").malloc_trim
            trim.argtypes = [ctypes.c_size_t]
            trim.restype = ctypes.c_int
            _MALLOC_TRIM_FN = trim
        return bool(_MALLOC_TRIM_FN(0))
    except Exception:
        # Non-glibc platforms (or a missing symbol) simply report no trim.
        return False
def reclaim_runtime_memory(config: dict[str, Any]) -> None:
    """Run gc.collect() and/or malloc_trim according to memory_reclaim options."""
    options = resolve_memory_reclaim_options(config)
    if not options["enabled"]:
        return
    if options["gc_collect"]:
        gc.collect()
    if options["malloc_trim"]:
        _try_malloc_trim()
def build_page_template_cache(
    template_path: Path,
    page_names: set[str],
) -> dict[str, bytes]:
    """Build a map of page name -> serialized single-slide PPTX bytes.

    Each page gets its own copy of the template reduced to just that page's
    slide, so later per-record builds can load a tiny deck instead of the
    full template.

    Raises:
        RuntimeError: when a page maps to an invalid slide index or the
            single-slide reduction fails.
    """
    cache: dict[str, bytes] = {}
    for name in sorted(page_names):
        deck = Presentation(str(template_path))
        idx = page_to_slide_index(name)
        if not (0 <= idx < len(deck.slides)):
            raise RuntimeError(f"slide index out of range for page={name}")
        if not keep_only_target_slide(deck, idx):
            raise RuntimeError(f"failed to prepare single-slide template for page={name}")
        payload = io.BytesIO()
        deck.save(payload)
        cache[name] = payload.getvalue()
    return cache
def resolve_image_delivery_options(config: dict[str, Any]) -> dict[str, Any]:
    """Resolve image-download throttle settings.

    Returns ``enabled``, ``max_kbps`` (0 disables throttling) and
    ``chunk_size`` in bytes; malformed values fall back to defaults.
    """
    perf = config.get("performance", {})
    delivery = perf.get("image_delivery", {}) if isinstance(perf, dict) else {}
    if not isinstance(delivery, dict):
        delivery = {}
    enabled = bool(delivery.get("enabled", True))
    try:
        max_kbps = int(delivery.get("max_kbps", 300))
    except Exception:
        max_kbps = 300
    max_kbps = max(0, max_kbps)
    try:
        chunk_kb = int(delivery.get("chunk_kb", 16))
    except Exception:
        chunk_kb = 16
    # Keep chunks within a sane 4KB-256KB window.
    chunk_kb = min(256, max(4, chunk_kb))
    if not enabled:
        max_kbps = 0
    return {
        "enabled": enabled,
        "max_kbps": max_kbps,
        "chunk_size": chunk_kb * 1024,
    }
def build_ppt_for_record(
    template_path: Path,
    config: dict[str, Any],
    record: dict[str, Any],
    out_pptx: Path,
    page_template_cache: dict[str, bytes] | None = None,
) -> int:
    """Render one record into *out_pptx*; return the 1-based page to export.

    When a cached single-slide template exists for the record's page it is
    loaded from memory (slide 0); otherwise the full template is opened and
    the correct slide located. Branch, amount, status and display name are
    then substituted in place.

    Raises:
        RuntimeError: when the page maps to an invalid slide index.
    """
    page_name = str(record.get("page", ""))
    cached = page_template_cache.get(page_name) if isinstance(page_template_cache, dict) else None
    if cached:
        prs = Presentation(io.BytesIO(cached))
        slide_index = 0
    else:
        prs = Presentation(str(template_path))
        slide_index = page_to_slide_index(page_name)
    if not (0 <= slide_index < len(prs.slides)):
        raise RuntimeError(f"slide index out of range for page={record.get('page')}")
    slide = prs.slides[slide_index]
    page_cfg = config.get("pages", {}).get(page_name, {})
    amount_shape = int(page_cfg.get("amount_shape", 2))
    status_shape = int(page_cfg.get("status_shape", 3))
    status_value = normalize_status_value(str(record.get("status", "")), config)
    replace_branch_in_slide(slide, str(record.get("branch", "")), config)
    replace_amount_in_slide(slide, amount_shape, str(record.get("amount", "")))
    replace_status_in_slide(slide, status_shape, status_value, page_name, config)
    replace_page_display_name_in_slide(slide, page_name, config)
    export_page = slide_index + 1
    # A full-template deck can still be trimmed to a single slide here.
    if not cached and is_single_slide_output_enabled(config) and keep_only_target_slide(prs, slide_index):
        export_page = 1
    prs.save(str(out_pptx))
    return export_page
def run_subprocess(args: list[str], timeout: int = 300) -> subprocess.CompletedProcess[str]:
    """Run *args* capturing text stdout/stderr; never raises on non-zero exit."""
    return subprocess.run(
        args,
        capture_output=True,
        text=True,
        check=False,
        timeout=timeout,
    )
def use_local_office_converter() -> bool:
    """True when a local LibreOffice/soffice binary was found and still exists."""
    if not LOCAL_LIBREOFFICE_BIN:
        return False
    return Path(str(LOCAL_LIBREOFFICE_BIN)).exists()
def use_local_pdftoppm() -> bool:
    """True when a local pdftoppm binary was found and still exists."""
    if not LOCAL_PDFTOPPM_BIN:
        return False
    return Path(str(LOCAL_PDFTOPPM_BIN)).exists()
def ensure_converter_image() -> None:
    """Ensure a PPTX->PDF converter is available, pulling the docker image if needed.

    A local LibreOffice binary short-circuits the docker path entirely. Uses
    double-checked locking on the module-level ``CONVERTER_IMAGE_READY`` flag
    so concurrent callers trigger at most one ``docker pull``.

    Raises:
        RuntimeError: when the docker image cannot be pulled.
    """
    global CONVERTER_IMAGE_READY
    if use_local_office_converter():
        CONVERTER_IMAGE_READY = True
        return
    if CONVERTER_IMAGE_READY:
        return
    with _CONVERTER_LOCK:
        # Re-check under the lock: another thread may have finished the pull.
        if CONVERTER_IMAGE_READY:
            return
        inspect = run_subprocess(["docker", "image", "inspect", CONVERTER_IMAGE], timeout=60)
        if inspect.returncode != 0:
            # Image not present locally: pull it (can take a while).
            pull = run_subprocess(["docker", "pull", CONVERTER_IMAGE], timeout=900)
            if pull.returncode != 0:
                raise RuntimeError(
                    f"failed to pull docker image {CONVERTER_IMAGE}: {pull.stderr.strip() or pull.stdout.strip()}"
                )
        CONVERTER_IMAGE_READY = True
def prewarm_converter_image(background: bool = True) -> None:
    """Warm up the converter image, optionally on a daemon thread."""
    def _task() -> None:
        try:
            ensure_converter_image()
            print(f"Converter image ready: {CONVERTER_IMAGE}")
        except Exception as exc:
            # Prewarm is best-effort: failures are logged, not raised.
            print(f"Converter image prewarm failed: {exc}")
    if not background:
        _task()
        return
    threading.Thread(target=_task, name="converter-prewarm", daemon=True).start()
def convert_pptx_to_pdf_batch(
    job_dir: Path,
    pptx_paths: list[Path],
    progress_cb: Callable[[int, str, str], None] | None = None,
) -> list[Path]:
    """Convert all *pptx_paths* to PDFs in *job_dir* in one LibreOffice run.

    Uses the local LibreOffice binary when available, otherwise the docker
    converter image with *job_dir* mounted at /work. While the converter
    runs, expected PDF outputs are polled so *progress_cb* (percent, stage,
    detail) can advance within the 70-84% band.

    Raises:
        RuntimeError: when the converter exits non-zero or any PDF is missing.
    """
    if not pptx_paths:
        return []
    pdf_paths = [p.with_suffix(".pdf") for p in pptx_paths]
    total = len(pdf_paths)
    start_ts = time.time()
    last_percent = -1
    last_done = -1
    if use_local_office_converter():
        args = [
            str(LOCAL_LIBREOFFICE_BIN),
            "--headless",
            "--convert-to",
            "pdf",
            "--outdir",
            str(job_dir),
        ]
        args.extend([str(p) for p in pptx_paths])
    else:
        ensure_converter_image()
        # Run as the host user so the produced files are not root-owned.
        uid_gid = f"{os.getuid()}:{os.getgid()}"
        workdir = str(job_dir)
        args = [
            "docker",
            "run",
            "--rm",
            "--user",
            uid_gid,
            "-v",
            f"{workdir}:/work",
            CONVERTER_IMAGE,
            "libreoffice",
            "--headless",
            "--convert-to",
            "pdf",
            "--outdir",
            "/work",
        ]
        args.extend([f"/work/{p.name}" for p in pptx_paths])
    proc = subprocess.Popen(
        args,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
    )
    # Poll the output directory so progress keeps moving during conversion.
    while True:
        ret = proc.poll()
        done = 0
        for p in pdf_paths:
            if p.exists():
                done += 1
        # Advance within the 70-84 band so progress does not sit at 70% for long.
        percent_by_done = int((done / max(total, 1)) * 14)
        percent_by_time = min(13, int((time.time() - start_ts) / 2.0))
        percent = min(84, 70 + max(percent_by_done, percent_by_time))
        if progress_cb and (done != last_done or percent != last_percent):
            progress_cb(percent, "PPT转PDF", f"已转 {done}/{total}")
            last_done = done
            last_percent = percent
        if ret is not None:
            break
        time.sleep(0.35)
    stdout, stderr = proc.communicate()
    if proc.returncode != 0:
        raise RuntimeError(f"pptx->pdf batch convert failed: {stderr.strip() or stdout.strip()}")
    if progress_cb:
        progress_cb(84, "PPT转PDF", f"已转 {total}/{total}")
    # A zero exit code does not guarantee every file converted; verify outputs.
    missing = [str(p.name) for p in pdf_paths if not p.exists()]
    if missing:
        raise RuntimeError(f"pptx->pdf batch convert failed, missing outputs: {', '.join(missing[:5])}")
    return pdf_paths
def extract_pdf_pages_to_png_batch(
    tasks: list[dict[str, Any]],
    config: dict[str, Any] | None = None,
    progress_cb: Callable[[int, str, str], None] | None = None,
) -> None:
    """Export one PNG per task via pdftoppm, fanned out with ``xargs -P``.

    Each task dict carries ``pdf_path``, ``png_path`` and ``page_number``. A
    NUL-delimited map file of (pdf, page, output-prefix) triples is written
    next to the PDFs and piped into ``xargs -0 -n 3``. Progress is polled
    from the produced PNG files into the 85-94% band. All task files are
    assumed to live in the first task's PDF directory (that directory is
    mounted at /work in the docker fallback).

    Raises:
        RuntimeError: when the extractor exits non-zero or any PNG is missing.
    """
    if not tasks:
        return
    first_pdf = Path(str(tasks[0]["pdf_path"]))
    job_dir = first_pdf.parent
    local_pdftoppm = use_local_pdftoppm()
    map_file = job_dir / "_extract_map.nul"
    with map_file.open("wb") as f:
        for t in tasks:
            if local_pdftoppm:
                # Local mode uses absolute paths; docker mode uses bare names under /work.
                pdf_field = str(Path(str(t["pdf_path"])).resolve())
                out_field = str(Path(str(t["png_path"])).with_suffix("").resolve())
            else:
                pdf_field = Path(str(t["pdf_path"])).name
                out_field = Path(str(t["png_path"])).with_suffix("").name
            pdf_name = pdf_field.encode("utf-8")
            page_number = str(int(t["page_number"])).encode("utf-8")
            out_prefix = out_field.encode("utf-8")
            # NUL-separated triple consumed by `xargs -0 -n 3`.
            f.write(pdf_name + b"\0" + page_number + b"\0" + out_prefix + b"\0")
    parallel_jobs = resolve_extract_workers(config or {})
    if local_pdftoppm:
        # NOTE(review): map_file's path is interpolated unquoted into the shell
        # redirect below — a job dir containing spaces would break; job dirs
        # look server-generated so this is presumably safe. Verify.
        script = (
            "set -e; "
            f"xargs -0 -P {parallel_jobs} -n 3 sh -c "
            "'pdf=\"$1\"; page=\"$2\"; out=\"$3\"; "
            f"\"{LOCAL_PDFTOPPM_BIN}\" -f \"$page\" -singlefile -png \"$pdf\" \"$out\"' _ "
            f"< {str(map_file)}"
        )
        args = ["sh", "-lc", script]
    else:
        ensure_converter_image()
        # Run as the host user so output files are not root-owned.
        uid_gid = f"{os.getuid()}:{os.getgid()}"
        script = (
            "set -e; "
            f"xargs -0 -P {parallel_jobs} -n 3 sh -c "
            "'pdf=\"$1\"; page=\"$2\"; out=\"$3\"; "
            "pdftoppm -f \"$page\" -singlefile -png \"/work/$pdf\" \"/work/$out\"' _ "
            "< /work/_extract_map.nul"
        )
        args = [
            "docker",
            "run",
            "--rm",
            "--user",
            uid_gid,
            "-v",
            f"{str(job_dir)}:/work",
            CONVERTER_IMAGE,
            "sh",
            "-lc",
            script,
        ]
    proc = subprocess.Popen(
        args,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
    )
    total = len(tasks)
    last_done = -1
    last_percent = -1
    # Poll for produced PNGs so progress keeps moving during extraction.
    while True:
        ret = proc.poll()
        done = 0
        for t in tasks:
            if Path(str(t["png_path"])).exists():
                done += 1
        percent = min(94, 85 + int((done / max(total, 1)) * 9))
        if progress_cb and (done != last_done or percent != last_percent):
            progress_cb(percent, "PDF转PNG", f"已导出 {done}/{total}")
            last_done = done
            last_percent = percent
        if ret is not None:
            break
        time.sleep(0.35)
    stdout, stderr = proc.communicate()
    map_file.unlink(missing_ok=True)
    if proc.returncode != 0:
        raise RuntimeError(f"pdf->png batch extract failed: {stderr.strip() or stdout.strip()}")
    # Verify every expected PNG landed; xargs can partially succeed.
    missing_png = [str(Path(str(t["png_path"])).name) for t in tasks if not Path(str(t["png_path"])).exists()]
    if missing_png:
        raise RuntimeError(f"pdf->png batch extract failed, missing outputs: {', '.join(missing_png[:5])}")
def cleanup_download_cache(max_age_seconds: float = 3600.0) -> None:
    """Drop download-cache entries older than *max_age_seconds* (default 1 hour).

    Generalized from a hard-coded 3600s TTL; the default preserves the old
    behavior. Entries missing ``created_at`` are treated as epoch-old and
    therefore expired. This function does not lock; callers such as
    ``register_download`` invoke it while holding ``_DOWNLOAD_LOCK``.
    """
    cutoff = time.time() - max_age_seconds
    expired = [
        token
        for token, meta in DOWNLOAD_CACHE.items()
        if float(meta.get("created_at", 0)) < cutoff
    ]
    for token in expired:
        DOWNLOAD_CACHE.pop(token, None)
def register_download(file_path: Path, content_type: str = "application/octet-stream") -> str:
    """Register *file_path* in the download cache and return its access token."""
    token = uuid.uuid4().hex
    with _DOWNLOAD_LOCK:
        # Opportunistically expire stale entries while holding the lock.
        cleanup_download_cache()
        meta = {
            "file_path": str(file_path),
            "filename": file_path.name,
            "content_type": content_type,
            "created_at": time.time(),
        }
        DOWNLOAD_CACHE[token] = meta
    return token
def generate_records(
    records: list[dict[str, Any]],
    config: dict[str, Any],
    template_path: Path,
    output_dir: Path,
    progress_cb: Callable[[int, str, str], None] | None = None,
) -> dict[str, Any]:
    """Run the full pipeline for *records*: PPTX -> PDF -> PNG plus download tokens.

    Creates a per-job directory under *output_dir*, builds one PPTX per
    record (optionally from a cached single-slide template), batch-converts
    to PDF then PNG, and registers each PNG for download. Progress is
    reported via *progress_cb* (percent, stage label, detail). Intermediate
    PPTX/PDF files are removed when ``output_settings.auto_cleanup_pptx`` is
    true. Returns a summary dict with ``job_id``, ``generated`` items and
    ``download_images``.
    """
    output_cfg = config.get("output_settings", {})
    auto_cleanup = bool(output_cfg.get("auto_cleanup_pptx", True)) if isinstance(output_cfg, dict) else True
    job_id = f"job_{now_ts()}_{uuid.uuid4().hex[:6]}"
    job_dir = output_dir / job_id
    job_dir.mkdir(parents=True, exist_ok=True)
    register_active_job_dir(job_dir)
    try:
        generated_items: list[dict[str, Any]] = []
        download_images: list[dict[str, Any]] = []
        tasks: list[dict[str, Any]] = []
        page_template_cache: dict[str, bytes] | None = None
        used_names: set[str] = set()
        total_records = len(records)
        generation_strategy = resolve_generation_strategy(config, total_records)
        if generation_strategy == "page_template_cache" and total_records > 0:
            # Pre-render one single-slide template per distinct page so each
            # record build loads a tiny deck instead of the full template.
            page_names = {str(r.get("page", "")).strip() for r in records if str(r.get("page", "")).strip()}
            if page_names:
                if progress_cb:
                    progress_cb(14, "准备模板", "构建单页模板缓存")
                page_template_cache = build_page_template_cache(template_path, page_names)
                if progress_cb:
                    progress_cb(18, "准备模板", f"已缓存 {len(page_template_cache)} 个页面")
        if progress_cb:
            progress_cb(20, "生成PPT", f"0/{total_records}")
        for i, rec in enumerate(records, start=1):
            # Derive a unique PNG file name; clashes get a _1/_2/... suffix.
            base_name = safe_filename(str(rec.get("output_file") or f"喜报_{rec.get('branch', '未知网点')}_{i}.png"))
            if not base_name.lower().endswith(".png"):
                base_name += ".png"
            final_name = base_name
            seq = 1
            while final_name in used_names:
                final_name = f"{Path(base_name).stem}_{seq}.png"
                seq += 1
            used_names.add(final_name)
            ppt_name = f"{Path(final_name).stem}.pptx"
            ppt_path = job_dir / ppt_name
            pdf_path = job_dir / f"{Path(final_name).stem}.pdf"
            png_path = job_dir / final_name
            tasks.append(
                {
                    "record": rec,
                    "ppt_path": ppt_path,
                    "pdf_path": pdf_path,
                    "png_path": png_path,
                    "page_number": 1,
                    "png_name": final_name,
                }
            )
        def _build_one(task: dict[str, Any]) -> None:
            # Build a single record's PPTX; failures are logged to the review
            # log and re-raised so the whole job aborts visibly.
            rec = dict(task.get("record", {}))
            ppt_path = Path(str(task.get("ppt_path")))
            try:
                export_page_number = build_ppt_for_record(
                    template_path,
                    config,
                    rec,
                    ppt_path,
                    page_template_cache=page_template_cache,
                )
                task["page_number"] = int(export_page_number)
            except Exception as exc:
                append_review_log(
                    "generation_error",
                    {
                        "reason": "build_ppt_failed",
                        "source_line": str(rec.get("source_line") or rec.get("raw_text") or ""),
                        "record": rec,
                        "error": str(exc),
                    },
                )
                raise
        max_workers = resolve_build_workers(config, total_records)
        if total_records <= 1 or max_workers <= 1:
            # Sequential build; PPT generation occupies the 20-65% band.
            done = 0
            for task in tasks:
                _build_one(task)
                done += 1
                if progress_cb:
                    pct = 20 + int((done / max(total_records, 1)) * 45)
                    progress_cb(pct, "生成PPT", f"{done}/{total_records}")
        else:
            # Parallel build with a bounded thread pool.
            done = 0
            with ThreadPoolExecutor(max_workers=max_workers) as ex:
                futs = [ex.submit(_build_one, t) for t in tasks]
                for fut in as_completed(futs):
                    fut.result()
                    done += 1
                    if progress_cb:
                        pct = 20 + int((done / max(total_records, 1)) * 45)
                        progress_cb(pct, "生成PPT", f"{done}/{total_records}")
        if progress_cb:
            progress_cb(70, "PPT转PDF", f"已转 0/{total_records}")
        convert_pptx_to_pdf_batch(
            job_dir,
            [Path(str(t["ppt_path"])) for t in tasks],
            progress_cb=progress_cb,
        )
        if progress_cb:
            progress_cb(85, "PDF转PNG", "已导出 0/{}".format(total_records))
        extract_pdf_pages_to_png_batch(tasks, config=config, progress_cb=progress_cb)
        for i, t in enumerate(tasks, start=1):
            rec = dict(t["record"])
            ppt_path = Path(str(t["ppt_path"]))
            pdf_path = Path(str(t["pdf_path"]))
            png_path = Path(str(t["png_path"]))
            if auto_cleanup:
                # Keep only the PNG; drop intermediate PPTX/PDF artifacts.
                ppt_path.unlink(missing_ok=True)
                pdf_path.unlink(missing_ok=True)
            item = dict(rec)
            item["png_file"] = str(t["png_name"])
            item["job_id"] = job_id
            item["image_path"] = str(png_path)
            token = register_download(png_path, content_type="image/png")
            item["download_token"] = token
            item["download_url"] = f"/api/download/{token}"
            generated_items.append(item)
            download_images.append(
                {
                    "name": str(t["png_name"]),
                    "download_token": token,
                    "download_url": f"/api/download/{token}",
                }
            )
            if progress_cb:
                pct = 95 + int((i / max(total_records, 1)) * 5)
                progress_cb(pct, "整理下载", f"{i}/{total_records}")
        return {
            "job_id": job_id,
            "job_dir": str(job_dir),
            "generation_strategy": generation_strategy,
            "generated": generated_items,
            "generated_count": len(generated_items),
            "download_images": download_images,
        }
    finally:
        unregister_active_job_dir(job_dir)
        # Proactively return memory after a heavy job (gc + optional malloc_trim).
        reclaim_runtime_memory(config)
class XibaoHandler(SimpleHTTPRequestHandler):
    """HTTP handler combining static file serving with the JSON API.

    GET and POST requests are first offered to the route tables in
    ``api_get_routes`` / ``api_post_routes`` (which receive this module's
    globals); unhandled GETs fall back to static files from STATIC_DIR and
    unhandled POSTs get a JSON 404.
    """
    def __init__(self, *args: Any, **kwargs: Any):
        # Serve static assets from the app's static directory.
        super().__init__(*args, directory=str(STATIC_DIR), **kwargs)
    def _send_json(self, payload: dict[str, Any], status: HTTPStatus = HTTPStatus.OK) -> None:
        """Send *payload* as a UTF-8 JSON response with *status*."""
        data = json.dumps(payload, ensure_ascii=False).encode("utf-8")
        self.send_response(status)
        self.send_header("Content-Type", "application/json; charset=utf-8")
        self.send_header("Content-Length", str(len(data)))
        self.end_headers()
        self.wfile.write(data)
    def _send_bytes(
        self,
        content: bytes,
        content_type: str,
        filename: str | None = None,
        status: HTTPStatus = HTTPStatus.OK,
    ) -> None:
        """Send an in-memory payload; *filename* adds a UTF-8 attachment disposition."""
        self.send_response(status)
        self.send_header("Content-Type", content_type)
        self.send_header("Content-Length", str(len(content)))
        if filename:
            encoded = quote(filename)
            self.send_header("Content-Disposition", f"attachment; filename*=UTF-8''{encoded}")
        self.end_headers()
        self.wfile.write(content)
    def _send_file(
        self,
        file_path: Path,
        content_type: str,
        filename: str | None = None,
        status: HTTPStatus = HTTPStatus.OK,
        max_kbps: int = 0,
        chunk_size: int = 16 * 1024,
    ) -> None:
        """Stream *file_path* in chunks, optionally throttled to *max_kbps* (0 = unlimited)."""
        total_size = int(file_path.stat().st_size)
        self.send_response(status)
        self.send_header("Content-Type", content_type)
        self.send_header("Content-Length", str(total_size))
        if filename:
            encoded = quote(filename)
            self.send_header("Content-Disposition", f"attachment; filename*=UTF-8''{encoded}")
        self.end_headers()
        max_bytes_per_sec = max(0, int(max_kbps)) * 1024
        block_size = max(4096, int(chunk_size))
        sent = 0
        start_ts = time.perf_counter()
        with file_path.open("rb") as f:
            while True:
                chunk = f.read(block_size)
                if not chunk:
                    break
                self.wfile.write(chunk)
                sent += len(chunk)
                if max_bytes_per_sec > 0:
                    # Sleep just enough (capped at 200ms) to hold the average rate.
                    elapsed = time.perf_counter() - start_ts
                    expected_elapsed = sent / max_bytes_per_sec
                    if expected_elapsed > elapsed:
                        time.sleep(min(expected_elapsed - elapsed, 0.2))
    def _read_json_body(self) -> dict[str, Any]:
        """Parse the request body as JSON; returns {} for an empty body.

        Raises:
            ValueError: when the body is not valid JSON.
        """
        try:
            content_len = int(self.headers.get("Content-Length", "0"))
        except ValueError:
            content_len = 0
        body = self.rfile.read(content_len) if content_len > 0 else b"{}"
        if not body:
            return {}
        try:
            return json.loads(body.decode("utf-8"))
        except json.JSONDecodeError:
            raise ValueError("invalid JSON body")
    def do_GET(self) -> None:  # noqa: N802
        """Dispatch GET to the API route table, then fall back to static files."""
        parsed = urlparse(self.path)
        if handle_get_routes(self, parsed, globals()):
            return
        if parsed.path == "/":
            self.path = "/index.html"
        super().do_GET()
    def do_POST(self) -> None:  # noqa: N802
        """Dispatch POST to the API route table; unknown paths get a JSON 404."""
        parsed = urlparse(self.path)
        if handle_post_routes(self, parsed, globals()):
            return
        self._send_json({"ok": False, "error": "not found"}, HTTPStatus.NOT_FOUND)
def run_server(
    host: str = "0.0.0.0",
    port: int = 8787,
    prewarm: bool = True,
    prewarm_blocking: bool = False,
) -> None:
    """Start the HTTP server and the daily-cleanup thread; block until Ctrl-C."""
    if prewarm:
        prewarm_converter_image(background=not prewarm_blocking)
    stop_event = threading.Event()
    cleanup_worker = threading.Thread(
        target=daily_cleanup_loop,
        args=(stop_event,),
        name="daily-cleanup",
        daemon=True,
    )
    cleanup_worker.start()
    httpd = ThreadingHTTPServer((host, port), XibaoHandler)
    print(f"Xibao Web running on http://{host}:{port}")
    try:
        httpd.serve_forever()
    except KeyboardInterrupt:
        pass
    finally:
        # Signal the cleanup loop to stop, then release the listening socket.
        stop_event.set()
        cleanup_worker.join(timeout=1.0)
        httpd.server_close()
if __name__ == "__main__":
    # CLI entry point: parse bind/prewarm options and start the server.
    cli = argparse.ArgumentParser(description="Xibao Web server")
    cli.add_argument("--host", default="0.0.0.0", help="bind host, e.g. 0.0.0.0 or 127.0.0.1")
    cli.add_argument("--port", type=int, default=8787, help="bind port")
    cli.add_argument(
        "--skip-prewarm",
        action="store_true",
        help="disable docker converter image prewarm on startup",
    )
    cli.add_argument(
        "--prewarm-blocking",
        action="store_true",
        help="prewarm converter image before serving (blocks startup until ready)",
    )
    options = cli.parse_args()
    run_server(
        host=options.host,
        port=options.port,
        prewarm=not options.skip_prewarm,
        prewarm_blocking=options.prewarm_blocking,
    )