#!/usr/bin/env python3 from __future__ import annotations import argparse from concurrent.futures import ThreadPoolExecutor, as_completed import ctypes import gc import hashlib import io import json import os import re import shutil import subprocess import threading import time import uuid from datetime import datetime, timedelta from http import HTTPStatus from http.server import SimpleHTTPRequestHandler, ThreadingHTTPServer from pathlib import Path from typing import Any, Callable from urllib.parse import quote, urlparse from pptx.enum.text import PP_ALIGN from pptx import Presentation from api_get_routes import handle_get_routes from api_post_routes import handle_post_routes BASE_DIR = Path(__file__).resolve().parent STATIC_DIR = BASE_DIR / "static" CONFIG_PATH = BASE_DIR / "config" / "xibao_logic.json" DATA_DIR = BASE_DIR / "data" DEFAULT_HISTORY_PATH = DATA_DIR / "generated_history.json" DEFAULT_OUTPUT_DIR = BASE_DIR / "output" DEFAULT_TEMPLATE_FALLBACK = BASE_DIR.parent / "黄金三十天喜报模版(余额、保险、理财)(1).pptx" REVIEW_LOG_DIR = DATA_DIR / "review_logs" MANUAL_RULES_PATH = DATA_DIR / "manual_rules.json" ISSUE_MARKS_PATH = DATA_DIR / "marked_issues.json" SKIPPED_SUPPRESS_PATH = DATA_DIR / "skipped_suppressions.json" CONVERTER_IMAGE = "minidocks/libreoffice" LOCAL_LIBREOFFICE_BIN = shutil.which("libreoffice") or shutil.which("soffice") LOCAL_PDFTOPPM_BIN = shutil.which("pdftoppm") DATA_DIR.mkdir(parents=True, exist_ok=True) DEFAULT_OUTPUT_DIR.mkdir(parents=True, exist_ok=True) REVIEW_LOG_DIR.mkdir(parents=True, exist_ok=True) _HISTORY_LOCK = threading.Lock() _DOWNLOAD_LOCK = threading.Lock() _CONVERTER_LOCK = threading.Lock() _LOG_LOCK = threading.Lock() _PROGRESS_LOCK = threading.Lock() _RULES_LOCK = threading.Lock() _ISSUE_LOCK = threading.Lock() _SKIP_SUPPRESS_LOCK = threading.Lock() _ACTIVE_JOB_LOCK = threading.Lock() _GENERATE_SLOT_LOCK = threading.Lock() _GENERATE_STATE_LOCK = threading.Lock() DOWNLOAD_CACHE: dict[str, dict[str, Any]] = {} GEN_PROGRESS: dict[str, dict[str, Any]] = {} ACTIVE_JOB_DIRS: set[str] = set() ACTIVE_GENERATION: dict[str, Any] = {"token": "", "started_at": 0.0} CONVERTER_IMAGE_READY = False _MALLOC_TRIM_FN: Callable[[int], int] | None = None TERM_DEFS = [ { "type": "三个月定期", "page": "page_0", "regex": re.compile(r"(?:三|3)\s*个?月"), }, { "type": "六个月定期", "page": "page_1", "regex": re.compile(r"(?:六|6)\s*个?月|半年"), }, { "type": "一年期定期", "page": "page_2", "regex": re.compile(r"(?:定期)?\s*(?:存\s*)?(?:一|1)\s*年(?:期)?|定期一年|存1年"), }, ] CHINESE_DIGIT_MAP = { "零": 0, "〇": 0, "一": 1, "二": 2, "两": 2, "三": 3, "四": 4, "五": 5, "六": 6, "七": 7, "八": 8, "九": 9, "壹": 1, "贰": 2, "叁": 3, "肆": 4, "伍": 5, "陆": 6, "柒": 7, "捌": 8, "玖": 9, } CHINESE_UNIT_MAP = {"十": 10, "百": 100, "千": 1000, "万": 10000, "亿": 100000000} CHINESE_NUMBER_RE = re.compile(r"[零〇一二两三四五六七八九十百千万亿壹贰叁肆伍陆柒捌玖拾佰仟点點]+") STATUS_HEURISTIC_PATTERNS: dict[str, list[re.Pattern[str]]] = { "揽收现金": [ re.compile(r"揽收[^,。,;;\n]{0,24}现金"), ], "揽收彩礼": [ re.compile(r"揽收[^,。,;;\n]{0,24}(?:彩礼|礼金)"), re.compile(r"(?:彩礼|礼金)[^,。,;;\n]{0,24}揽收"), ], "微信提现": [ re.compile(r"(?:微信|支付宝)[^,。,;;\n]{0,16}提现"), re.compile(r"提现[^,。,;;\n]{0,16}(?:微信|支付宝)"), ], "商户提现": [ re.compile(r"商户[^,。,;;\n]{0,16}提现"), re.compile(r"提现[^,。,;;\n]{0,16}商户"), ], "揽收商户": [ re.compile(r"揽收[^,。,;;\n]{0,16}商户"), re.compile(r"商户[^,。,;;\n]{0,16}揽收"), ], "揽收他行": [ re.compile(r"揽收[^,。,;;\n]{0,20}他行"), re.compile(r"他行[^,。,;;\n]{0,20}揽收"), ], "他行挖转": [ re.compile(r"挖转[^,。,;;\n]{0,20}他行"), re.compile(r"他行[^,。,;;\n]{0,20}挖转"), re.compile(r"挖他行"), ], } class ConfigError(RuntimeError): pass def now_ts() -> str: return datetime.now().strftime("%Y%m%d_%H%M%S") def load_config() -> dict[str, Any]: if not CONFIG_PATH.exists(): raise ConfigError(f"config file not found: {CONFIG_PATH}") with CONFIG_PATH.open("r", encoding="utf-8") as f: config = json.load(f) if not isinstance(config, dict): raise ConfigError("config root must be a JSON object") return config def is_windows_path(path_text: str) -> bool: return bool(re.match(r"^[A-Za-z]:\\", path_text)) def resolve_history_path(config: dict[str, Any]) -> Path: history_file = str(config.get("history_file", "")).strip() if history_file and not is_windows_path(history_file): p = Path(history_file) if not p.is_absolute(): p = BASE_DIR / p p.parent.mkdir(parents=True, exist_ok=True) return p DEFAULT_HISTORY_PATH.parent.mkdir(parents=True, exist_ok=True) return DEFAULT_HISTORY_PATH def resolve_template_path(config: dict[str, Any], override_path: str | None = None) -> Path: candidates: list[Path] = [] if override_path: p = Path(override_path) if not p.is_absolute(): p = BASE_DIR / p candidates.append(p) cfg_template = str(config.get("template_file", "")).strip() if cfg_template and not is_windows_path(cfg_template): p = Path(cfg_template) if not p.is_absolute(): p = BASE_DIR / p candidates.append(p) env_template = os.environ.get("XIBAO_TEMPLATE_FILE", "").strip() if env_template: p = Path(env_template) if not p.is_absolute(): p = BASE_DIR / p candidates.append(p) candidates.append(DEFAULT_TEMPLATE_FALLBACK) for p in candidates: if p.exists() and p.is_file(): return p raise ConfigError( "template file not found. Set config.template_file to a valid local path " "or pass template_file in request body." ) def resolve_output_dir(config: dict[str, Any], override_path: str | None = None) -> Path: if override_path: p = Path(override_path) if not p.is_absolute(): p = BASE_DIR / p p.mkdir(parents=True, exist_ok=True) return p output_dir = str(config.get("output_dir", "")).strip() if output_dir and not is_windows_path(output_dir): p = Path(output_dir) if not p.is_absolute(): p = BASE_DIR / p p.mkdir(parents=True, exist_ok=True) return p DEFAULT_OUTPUT_DIR.mkdir(parents=True, exist_ok=True) return DEFAULT_OUTPUT_DIR def load_history(history_path: Path) -> list[dict[str, Any]]: if not history_path.exists(): return [] with history_path.open("r", encoding="utf-8") as f: raw = json.load(f) if isinstance(raw, list): return [r for r in raw if isinstance(r, dict)] if isinstance(raw, dict) and isinstance(raw.get("records"), list): return [r for r in raw["records"] if isinstance(r, dict)] return [] def save_history(history_path: Path, records: list[dict[str, Any]]) -> None: with history_path.open("w", encoding="utf-8") as f: json.dump(records, f, ensure_ascii=False, indent=2) def load_manual_rules() -> list[dict[str, Any]]: if not MANUAL_RULES_PATH.exists(): return [] with _RULES_LOCK: try: with MANUAL_RULES_PATH.open("r", encoding="utf-8") as f: raw = json.load(f) except Exception: return [] if not isinstance(raw, list): return [] return [x for x in raw if isinstance(x, dict)] def save_manual_rules(rules: list[dict[str, Any]]) -> None: with _RULES_LOCK: with MANUAL_RULES_PATH.open("w", encoding="utf-8") as f: json.dump(rules, f, ensure_ascii=False, indent=2) def _load_issue_marks_unlocked() -> list[dict[str, Any]]: if not ISSUE_MARKS_PATH.exists(): return [] try: with ISSUE_MARKS_PATH.open("r", encoding="utf-8") as f: raw = json.load(f) except Exception: return [] if not isinstance(raw, list): return [] return [x for x in raw if isinstance(x, dict)] def _save_issue_marks_unlocked(items: list[dict[str, Any]]) -> None: ISSUE_MARKS_PATH.parent.mkdir(parents=True, exist_ok=True) with ISSUE_MARKS_PATH.open("w", encoding="utf-8") as f: json.dump(items, f, ensure_ascii=False, indent=2) def list_issue_marks(status: str = "active", limit: int = 500) -> tuple[list[dict[str, Any]], int]: status_norm = str(status or "active").strip().lower() if status_norm not in {"active", "resolved", "all"}: status_norm = "active" limit = max(1, min(2000, int(limit))) with _ISSUE_LOCK: items = _load_issue_marks_unlocked() normalized: list[dict[str, Any]] = [] for item in items: obj = dict(item) item_status = str(obj.get("status", "active")).strip().lower() if item_status not in {"active", "resolved"}: item_status = "active" obj["status"] = item_status normalized.append(obj) if status_norm == "all": filtered = normalized else: filtered = [x for x in normalized if str(x.get("status", "active")) == status_norm] filtered.sort( key=lambda x: str(x.get("updated_at") or x.get("created_at") or ""), reverse=True, ) total = len(filtered) return filtered[:limit], total def upsert_issue_mark( *, mark_type: str, source_line: str, note: str = "", record: dict[str, Any] | None = None, ) -> tuple[dict[str, Any], bool]: mark_type_text = str(mark_type).strip() if mark_type_text not in {"recognition_error", "generation_error"}: raise ValueError("mark_type must be recognition_error or generation_error") line_text = str(source_line).strip() if not line_text: raise ValueError("source_line is required") now = datetime.now().isoformat(timespec="seconds") record_obj = dict(record) if isinstance(record, dict) else {} note_text = str(note).strip() with _ISSUE_LOCK: items = _load_issue_marks_unlocked() for item in items: if not isinstance(item, dict): continue if str(item.get("status", "active")).strip().lower() != "active": continue if str(item.get("mark_type", "")).strip() != mark_type_text: continue if str(item.get("source_line", "")).strip() != line_text: continue item["note"] = note_text item["record"] = record_obj item["updated_at"] = now _save_issue_marks_unlocked(items) return dict(item), False issue = { "id": uuid.uuid4().hex[:12], "mark_type": mark_type_text, "source_line": line_text, "note": note_text, "record": record_obj, "status": "active", "created_at": now, "updated_at": now, } items.append(issue) _save_issue_marks_unlocked(items) return dict(issue), True def update_issue_mark( *, issue_id: str, mark_type: str | None = None, source_line: str | None = None, note: str | None = None, record: dict[str, Any] | None = None, ) -> dict[str, Any] | None: issue_id_text = str(issue_id).strip() if not issue_id_text: raise ValueError("id is required") now = datetime.now().isoformat(timespec="seconds") with _ISSUE_LOCK: items = _load_issue_marks_unlocked() target = None for item in items: if str(item.get("id", "")).strip() == issue_id_text: target = item break if target is None: return None if mark_type is not None: mark_type_text = str(mark_type).strip() if mark_type_text not in {"recognition_error", "generation_error"}: raise ValueError("mark_type must be recognition_error or generation_error") target["mark_type"] = mark_type_text if source_line is not None: source_line_text = str(source_line).strip() if not source_line_text: raise ValueError("source_line is required") target["source_line"] = source_line_text if note is not None: target["note"] = str(note).strip() if record is not None: target["record"] = dict(record) if isinstance(record, dict) else {} target["updated_at"] = now _save_issue_marks_unlocked(items) return dict(target) def delete_issue_mark(issue_id: str) -> bool: issue_id_text = str(issue_id).strip() if not issue_id_text: raise ValueError("id is required") with _ISSUE_LOCK: items = _load_issue_marks_unlocked() remain: list[dict[str, Any]] = [] deleted = False for item in items: if str(item.get("id", "")).strip() == issue_id_text: deleted = True continue remain.append(item) if deleted: _save_issue_marks_unlocked(remain) return deleted def resolve_issue_marks_by_source_line(source_line: str, reason: str = "") -> dict[str, Any]: line_text = str(source_line).strip() if not line_text: return {"count": 0, "ids": []} now = datetime.now().isoformat(timespec="seconds") resolved_ids: list[str] = [] with _ISSUE_LOCK: items = _load_issue_marks_unlocked() changed = False for item in items: if str(item.get("status", "active")).strip().lower() != "active": continue if str(item.get("source_line", "")).strip() != line_text: continue item["status"] = "resolved" item["resolved_at"] = now item["updated_at"] = now if reason: item["resolved_reason"] = reason resolved_ids.append(str(item.get("id", "")).strip()) changed = True if changed: _save_issue_marks_unlocked(items) return {"count": len([x for x in resolved_ids if x]), "ids": [x for x in resolved_ids if x]} def normalize_skip_line(line: str) -> str: return re.sub(r"\s+", "", str(line or "").strip()) def skip_suppression_key(line: str, reason: str = "") -> str: normalized_line = normalize_skip_line(line) normalized_reason = str(reason or "").strip() if not normalized_line: return "" return f"{normalized_reason}|{normalized_line}" def skip_suppression_keys_for_item(line: str, reason: str = "") -> list[str]: normalized_line = normalize_skip_line(line) if not normalized_line: return [] normalized_reason = str(reason or "").strip() keys = [f"*|{normalized_line}"] if normalized_reason: keys.insert(0, f"{normalized_reason}|{normalized_line}") return keys def _load_skip_suppressions_unlocked() -> list[dict[str, Any]]: if not SKIPPED_SUPPRESS_PATH.exists(): return [] try: with SKIPPED_SUPPRESS_PATH.open("r", encoding="utf-8") as f: raw = json.load(f) except Exception: return [] if not isinstance(raw, list): return [] return [x for x in raw if isinstance(x, dict)] def _save_skip_suppressions_unlocked(items: list[dict[str, Any]]) -> None: SKIPPED_SUPPRESS_PATH.parent.mkdir(parents=True, exist_ok=True) with SKIPPED_SUPPRESS_PATH.open("w", encoding="utf-8") as f: json.dump(items, f, ensure_ascii=False, indent=2) def list_skip_suppressions() -> list[dict[str, Any]]: with _SKIP_SUPPRESS_LOCK: return _load_skip_suppressions_unlocked() def build_skip_suppression_lookup(items: list[dict[str, Any]]) -> set[str]: keys: set[str] = set() for item in items: if not isinstance(item, dict): continue key = str(item.get("key", "")).strip() if key: keys.add(key) continue line = str(item.get("line", "")).strip() reason = str(item.get("reason", "")).strip() fallback = skip_suppression_key(line, reason) if fallback: keys.add(fallback) return keys def is_skip_item_suppressed(line: str, reason: str, lookup: set[str]) -> bool: for key in skip_suppression_keys_for_item(line, reason): if key in lookup: return True return False def suppress_skip_item(line: str, reason: str = "") -> tuple[dict[str, Any], bool]: line_text = str(line or "").strip() if not line_text: raise ValueError("line is required") reason_text = str(reason or "").strip() # "*" means suppress the same line regardless of reason. normalized_reason = reason_text if reason_text else "*" key = skip_suppression_key(line_text, normalized_reason) if not key: raise ValueError("line is required") now = datetime.now().isoformat(timespec="seconds") with _SKIP_SUPPRESS_LOCK: items = _load_skip_suppressions_unlocked() for item in items: if str(item.get("key", "")).strip() != key: continue item["updated_at"] = now _save_skip_suppressions_unlocked(items) return dict(item), False obj = { "id": uuid.uuid4().hex[:12], "key": key, "line": line_text, "reason": normalized_reason, "created_at": now, "updated_at": now, } items.append(obj) _save_skip_suppressions_unlocked(items) return obj, True def clear_skip_suppressions() -> int: with _SKIP_SUPPRESS_LOCK: items = _load_skip_suppressions_unlocked() count = len(items) _save_skip_suppressions_unlocked([]) return count def register_active_job_dir(job_dir: Path) -> None: with _ACTIVE_JOB_LOCK: ACTIVE_JOB_DIRS.add(str(job_dir.resolve())) def unregister_active_job_dir(job_dir: Path) -> None: with _ACTIVE_JOB_LOCK: ACTIVE_JOB_DIRS.discard(str(job_dir.resolve())) def get_active_job_dirs() -> set[str]: with _ACTIVE_JOB_LOCK: return set(ACTIVE_JOB_DIRS) def acquire_generation_slot(token: str = "") -> bool: if not _GENERATE_SLOT_LOCK.acquire(blocking=False): return False with _GENERATE_STATE_LOCK: ACTIVE_GENERATION["token"] = str(token).strip() ACTIVE_GENERATION["started_at"] = time.time() return True def release_generation_slot(token: str = "") -> None: _ = token with _GENERATE_STATE_LOCK: ACTIVE_GENERATION["token"] = "" ACTIVE_GENERATION["started_at"] = 0.0 if _GENERATE_SLOT_LOCK.locked(): try: _GENERATE_SLOT_LOCK.release() except RuntimeError: pass def get_active_generation() -> dict[str, Any]: with _GENERATE_STATE_LOCK: return dict(ACTIVE_GENERATION) def today_log_path() -> Path: day = datetime.now().strftime("%Y-%m-%d") return REVIEW_LOG_DIR / f"review_{day}.jsonl" def count_log_lines(path: Path) -> int: if not path.exists(): return 0 n = 0 with path.open("r", encoding="utf-8") as f: for _ in f: n += 1 return n def append_review_log(event: str, payload: dict[str, Any] | None = None) -> Path: REVIEW_LOG_DIR.mkdir(parents=True, exist_ok=True) log_path = today_log_path() entry = { "ts": datetime.now().isoformat(timespec="seconds"), "event": str(event), } if isinstance(payload, dict): entry.update(payload) with _LOG_LOCK: with log_path.open("a", encoding="utf-8") as f: f.write(json.dumps(entry, ensure_ascii=False)) f.write("\n") return log_path def get_valid_status_list(config: dict[str, Any]) -> list[str]: valid_status = config.get("replace_algorithm", {}).get("status", {}).get("valid_status", []) if not isinstance(valid_status, list) or not valid_status: valid_status = config.get("status_extraction", {}).get("valid_status", []) if not isinstance(valid_status, list): return [] return [str(x).strip() for x in valid_status if str(x).strip()] def normalize_status_value(status: str, config: dict[str, Any]) -> str: fallback = str(config.get("status_extraction", {}).get("fallback", "成功营销")).strip() or "成功营销" valid_status = get_valid_status_list(config) if not valid_status: return str(status).strip() or fallback s = str(status).strip() if s in valid_status: return s return fallback if fallback in valid_status else valid_status[0] def is_date_header_line(normalized_line: str) -> bool: return bool(re.fullmatch(r"\d{1,2}月\d{1,2}[日号]?", normalized_line)) def log_parse_skipped(skipped: list[dict[str, Any]], source: str) -> int: written = 0 for item in skipped: if not isinstance(item, dict): continue reason = str(item.get("reason", "")).strip() # 说明行不记录,避免日志噪音。 if not reason or reason == "skip_line_rule": continue source_line = str(item.get("line", "")).strip() append_review_log( "parse_skip", { "source": source, "reason": reason, "source_line": source_line, }, ) written += 1 return written def is_path_under(path: Path, parent: Path) -> bool: try: path.resolve().relative_to(parent.resolve()) return True except Exception: return False def cleanup_output_artifacts(output_dir: Path) -> dict[str, int]: output_dir.mkdir(parents=True, exist_ok=True) removed_dirs = 0 removed_files = 0 errors = 0 active_dirs = get_active_job_dirs() for entry in output_dir.iterdir(): try: if entry.is_dir() and entry.name.startswith("job_"): if str(entry.resolve()) in active_dirs: continue shutil.rmtree(entry) removed_dirs += 1 continue if entry.is_file() and entry.suffix.lower() in {".png", ".pdf", ".pptx", ".zip"}: entry.unlink(missing_ok=True) removed_files += 1 except Exception: errors += 1 with _DOWNLOAD_LOCK: stale_tokens: list[str] = [] for token, meta in DOWNLOAD_CACHE.items(): file_path = Path(str(meta.get("file_path", ""))) if not file_path.exists() or is_path_under(file_path, output_dir): stale_tokens.append(token) for token in stale_tokens: DOWNLOAD_CACHE.pop(token, None) return { "removed_dirs": removed_dirs, "removed_files": removed_files, "errors": errors, } def daily_cleanup_loop(stop_event: threading.Event) -> None: while not stop_event.is_set(): now = datetime.now() next_midnight = (now + timedelta(days=1)).replace(hour=0, minute=0, second=0, microsecond=0) wait_seconds = max((next_midnight - now).total_seconds(), 1.0) if stop_event.wait(wait_seconds): break try: config = load_config() output_dir = resolve_output_dir(config) stat = cleanup_output_artifacts(output_dir) print( "[daily-cleanup] done at midnight, " f"removed_dirs={stat['removed_dirs']} removed_files={stat['removed_files']} errors={stat['errors']}" ) append_review_log( "daily_cleanup", { "output_dir": str(output_dir), **stat, }, ) except Exception as exc: print(f"[daily-cleanup] failed: {exc}") append_review_log("daily_cleanup_error", {"error": str(exc)}) def build_history_lookup(history: list[dict[str, Any]], key_fields: list[str]) -> dict[str, dict[str, Any]]: lookup: dict[str, dict[str, Any]] = {} for item in history: if not isinstance(item, dict): continue k = key_for_record(item, key_fields) if not k: continue existing = lookup.get(k) if existing is None: lookup[k] = item continue prev_ts = str(existing.get("created_at", "")) curr_ts = str(item.get("created_at", "")) if curr_ts >= prev_ts: lookup[k] = item return lookup def build_history_signature_lookup(history: list[dict[str, Any]]) -> dict[str, dict[str, Any]]: lookup: dict[str, dict[str, Any]] = {} for item in history: if not isinstance(item, dict): continue k = signature_key_for_record(item) if not k: continue existing = lookup.get(k) if existing is None: lookup[k] = item continue prev_ts = str(existing.get("created_at", "")) curr_ts = str(item.get("created_at", "")) if curr_ts >= prev_ts: lookup[k] = item return lookup def resolve_history_image_path(history_record: dict[str, Any], config: dict[str, Any]) -> Path | None: candidates: list[Path] = [] image_path = str(history_record.get("image_path", "")).strip() if image_path: p = Path(image_path) if not p.is_absolute(): p = BASE_DIR / p candidates.append(p) output_dir = resolve_output_dir(config) png_file = str(history_record.get("png_file", "")).strip() output_file = str(history_record.get("output_file", "")).strip() job_id = str(history_record.get("job_id", "")).strip() if png_file and job_id: candidates.append(output_dir / job_id / png_file) if output_file and job_id: candidates.append(output_dir / job_id / output_file) if output_file: # 兼容老历史记录:仅存 output_file 时尝试在 job_* 中定位。 for p in output_dir.glob(f"job_*/{output_file}"): candidates.append(p) break seen: set[str] = set() for c in candidates: key = str(c) if key in seen: continue seen.add(key) if c.exists() and c.is_file(): return c return None def attach_duplicate_download_info( dup_record: dict[str, Any], history_record: dict[str, Any] | None, config: dict[str, Any], ) -> None: if not isinstance(history_record, dict): return image_path = resolve_history_image_path(history_record, config) if not image_path: return token = register_download(image_path, content_type="image/png") dup_record["download_token"] = token dup_record["download_url"] = f"/api/download/{token}" dup_record["image_name"] = image_path.name if not dup_record.get("output_file"): dup_record["output_file"] = image_path.name def build_history_view_items( history: list[dict[str, Any]], config: dict[str, Any], limit: int = 300, ) -> list[dict[str, Any]]: items = [x for x in history if isinstance(x, dict)] items.sort(key=lambda x: str(x.get("created_at", "")), reverse=True) if limit > 0: items = items[:limit] rows: list[dict[str, Any]] = [] for item in items: row = dict(item) image_path = resolve_history_image_path(row, config) if image_path: token = register_download(image_path, content_type="image/png") row["download_token"] = token row["download_url"] = f"/api/download/{token}" row["image_exists"] = True row["image_name"] = image_path.name else: row["image_exists"] = False rows.append(row) return rows def _cleanup_progress_cache_locked() -> None: now = time.time() stale: list[str] = [] for token, item in GEN_PROGRESS.items(): updated_at = float(item.get("updated_at", 0)) if now - updated_at > 3 * 3600: stale.append(token) for token in stale: GEN_PROGRESS.pop(token, None) def cleanup_progress_cache() -> None: with _PROGRESS_LOCK: _cleanup_progress_cache_locked() def set_generation_progress( token: str, *, status: str, stage: str, percent: int, detail: str = "", error: str = "", ) -> None: t = str(token).strip() if not t: return now_iso = datetime.now().isoformat(timespec="seconds") item = { "token": t, "status": status, "stage": stage, "percent": max(0, min(100, int(percent))), "detail": detail, "error": error, "updated_at": time.time(), "updated_at_text": now_iso, } with _PROGRESS_LOCK: _cleanup_progress_cache_locked() GEN_PROGRESS[t] = item def get_generation_progress(token: str) -> dict[str, Any] | None: t = str(token).strip() if not t: return None with _PROGRESS_LOCK: _cleanup_progress_cache_locked() item = GEN_PROGRESS.get(t) if item is None: return None return dict(item) def strip_trailing_zero(num_text: str) -> str: if "." not in num_text: return num_text return num_text.rstrip("0").rstrip(".") def floor_amount_number(value: Any) -> int | None: s = str(value).strip() if not s: return None m = re.search(r"\d+(?:\.\d+)?", s) if not m: return None try: num = float(m.group(0)) except ValueError: return None return max(0, int(num)) def normalize_amount_text(value: Any) -> str: v = floor_amount_number(value) if v is None: return "" return f"{v}万" def normalize_line(line: str, line_pattern: str) -> str: line = line.strip() if not line: return "" try: line = re.sub(line_pattern, "", line) except re.error: pass line = re.sub(r"\s+", "", line) return line.strip() def extract_line_serial(source_line: str, line_pattern: str) -> str: line = str(source_line).strip() if not line: return "" # Prefer extracting from configured line prefix regex. try: m = re.match(line_pattern, line) if m: prefix = m.group(0) dm = re.search(r"\d+", prefix) if dm: return str(int(dm.group(0))) except re.error: pass # Fallback: common numbering formats (e.g. "12、", "12.", "12)"). dm = re.match(r"^\s*(\d+)\s*(?:[、,,..\))]\s*)?", line) if dm: return str(int(dm.group(1))) return "" def line_signature(normalized_line: str) -> str: text = re.sub(r"\s+", "", str(normalized_line).strip()) if not text: return "" digest = hashlib.sha1(text.encode("utf-8")).hexdigest() return digest[:16] def extract_branch(line: str, config: dict[str, Any]) -> tuple[str | None, str | None]: branches_cfg = config.get("branches", {}) allowed = branches_cfg.get("allowed", []) alias = branches_cfg.get("alias", {}) candidates: list[str] = [] if isinstance(alias, dict): candidates.extend(str(k) for k in alias.keys()) if isinstance(allowed, list): candidates.extend(str(b) for b in allowed) candidates = sorted(set(candidates), key=len, reverse=True) raw_branch = None for name in candidates: if name and name in line: raw_branch = name break if raw_branch is None: return None, None normalized = str(alias.get(raw_branch, raw_branch)) if isinstance(alias, dict) else raw_branch return raw_branch, normalized def match_status_heuristic(line: str, status: str) -> bool: patterns = STATUS_HEURISTIC_PATTERNS.get(str(status).strip(), []) for pattern in patterns: if pattern.search(line): return True return False def extract_status(line: str, config: dict[str, Any]) -> str: # Normalize common wording variants before status matching. status_line = str(line or "").replace("外行", "他行") cfg = config.get("status_extraction", {}) rules = cfg.get("extraction_rules", {}) order = cfg.get("priority_order", []) fallback = str(cfg.get("fallback", "成功营销")) if not isinstance(rules, dict): return fallback if not isinstance(order, list) or not order: order = [str(k) for k in rules.keys()] for status in order: keywords = rules.get(status, [status]) if not isinstance(keywords, list): continue for kw in keywords: kw_text = str(kw) if kw_text and kw_text in status_line: return normalize_status_value(str(status), config) valid_status = set(get_valid_status_list(config)) for status in order: s = str(status).strip() if not s: continue if valid_status and s not in valid_status: continue if match_status_heuristic(status_line, s): return normalize_status_value(s, config) # Heuristic: plain "提现" defaults to 微信提现 unless it clearly indicates 商户. if "提现" in status_line: if "商户" in status_line and "商户提现" in valid_status: return "商户提现" if "微信提现" in valid_status: return "微信提现" # Heuristic: "揽收他行40万存一年" 这类语句不能回退到“成功营销”。 if "他行" in status_line: if "揽收他行" in valid_status: return "揽收他行" if "他行挖转" in valid_status: return "他行挖转" return normalize_status_value(fallback, config) def normalize_chinese_number_text(text: str) -> str: return ( str(text) .strip() .replace("點", "点") .replace("拾", "十") .replace("佰", "百") .replace("仟", "千") ) def chinese_integer_to_float(text: str) -> float | None: s = normalize_chinese_number_text(text) if not s: return None total = 0 section = 0 number = 0 seen = False for ch in s: if ch in CHINESE_DIGIT_MAP: number = CHINESE_DIGIT_MAP[ch] seen = True continue unit = CHINESE_UNIT_MAP.get(ch) if unit is None: return None seen = True if unit in {10, 100, 1000}: if number == 0: number = 1 section += number * unit number = 0 continue section += number if section == 0: section = 1 total += section * unit section = 0 number = 0 if not seen: return None return float(total + section + number) def chinese_text_to_float(text: str) -> float | None: s = normalize_chinese_number_text(text) if not s: return None if "点" in s: int_part, frac_part = s.split("点", 1) if not frac_part: return None int_value = chinese_integer_to_float(int_part) if int_part else 0.0 if int_value is None: return None frac_digits: list[str] = [] for ch in frac_part: digit = CHINESE_DIGIT_MAP.get(ch) if digit is None: return None frac_digits.append(str(digit)) return float(f"{int(int_value)}.{''.join(frac_digits)}") return chinese_integer_to_float(s) def chinese_digit_fraction(text: str) -> float | None: s = normalize_chinese_number_text(text) if not s: return None if not all(ch in CHINESE_DIGIT_MAP for ch in s): return None digits = "".join(str(CHINESE_DIGIT_MAP[ch]) for ch in s) return int(digits) / (10 ** len(digits)) def parse_wan_suffix_shorthand(text: str) -> float | None: s = normalize_chinese_number_text(text) if not s: return 0.0 if any(ch in s for ch in ("十", "百", "千", "万", "亿")): return None if "点" in s: right_val = chinese_text_to_float(s) if right_val is None: return None # e.g. "五点五" after "万" => 0.55 万 return right_val / 10.0 return chinese_digit_fraction(s) def parse_chinese_amount_value(raw_text: str, right_1: str, right_2: str) -> float | None: token = normalize_chinese_number_text(raw_text) if not token: return None has_digit = any(ch in CHINESE_DIGIT_MAP for ch in token) has_small_unit = any(ch in token for ch in ("十", "百", "千")) if not has_digit and not has_small_unit: return None has_wan = ("万" in token) or (right_1 == "万") or right_2.startswith("万元") if has_wan: if "万" in token: left, right = token.split("万", 1) left_text = left.strip() right_text = right.strip() left_value = chinese_text_to_float(left_text) if left_text else 0.0 if left_value is None: return None if not right_text: return left_value right_norm = normalize_chinese_number_text(right_text) # Full unit expression (e.g. 两万五千、两万零五百) -> convert absolute value to "万". if any(ch in right_norm for ch in ("十", "百", "千", "万", "亿")): absolute = chinese_text_to_float(token) if absolute is None: return None return absolute / 10000.0 # Colloquial shorthand (e.g. 两万五 -> 2.5万, 一万二 -> 1.2万). shorthand = parse_wan_suffix_shorthand(right_norm) if shorthand is not None: return left_value + shorthand right_value = chinese_text_to_float(right_norm) if right_value is None: return left_value return left_value + (right_value / 10.0) # e.g. "五 万" where token may be "五" value = chinese_text_to_float(token) return value return chinese_text_to_float(token) def extract_amount_candidates(text: str) -> list[dict[str, Any]]: candidates: list[dict[str, Any]] = [] for m in re.finditer(r"\d+(?:\.\d+)?", text): num_text = m.group(0) start, end = m.span() right_2 = text[end : end + 2] right_1 = text[end : end + 1] # Exclude time/date/tenor hints: 2月23日, 5年交, 6个月 ... if right_1 in {"年", "月", "期", "号"}: continue if right_2.startswith("个月"): continue has_wan = False if right_1 in {"万", "W", "w"} or right_2.startswith("万元"): has_wan = True try: value = float(num_text) except ValueError: continue candidates.append( { "value": value, "num": strip_trailing_zero(num_text), "start": start, "end": end, "has_wan": has_wan, } ) for m in CHINESE_NUMBER_RE.finditer(text): raw_text = m.group(0) start, end = m.span() right_2 = text[end : end + 2] right_1 = text[end : end + 1] if right_1 in {"年", "月", "期", "号"}: continue if right_2.startswith("个月"): continue # Avoid treating single Chinese digits in names as amount, e.g. "钱七/周八/吴九". # Keep unit-bearing forms (e.g. "十万", "五万元") untouched. if ( len(raw_text) == 1 and raw_text in CHINESE_DIGIT_MAP and right_1 not in {"万", "元"} and not right_2.startswith("万元") ): continue value = parse_chinese_amount_value(raw_text, right_1, right_2) if value is None: continue has_wan = ("万" in raw_text) or (right_1 == "万") or right_2.startswith("万元") candidates.append( { "value": value, "num": strip_trailing_zero(str(value)), "start": start, "end": end, "has_wan": has_wan, } ) return candidates def amount_num_to_text(num_text: str) -> str: v = floor_amount_number(num_text) if v is None: return "" return f"{v}万" def pick_amount_from_candidates( text: str, candidates: list[dict[str, Any]], anchor_pos: int | None = None, ) -> str | None: if not candidates: return None has_wan_candidates = [c for c in candidates if bool(c.get("has_wan"))] if anchor_pos is None: # Prefer "合计/共计" amount, otherwise largest amount. preferred = has_wan_candidates or candidates for c in preferred: near = text[max(0, c["start"] - 2) : c["start"] + 2] if "合计" in near or "共计" in near: return amount_num_to_text(c["num"]) best = max(preferred, key=lambda x: x["value"]) return amount_num_to_text(best["num"]) def score(c: dict[str, Any]) -> tuple[float, int]: distance = abs(c["start"] - anchor_pos) left = text[max(0, c["start"] - 3) : c["start"]] if "合计" in left or "共计" in left: distance -= 3 # Prefer value on the left side if equally close. side = 0 if c["start"] <= anchor_pos else 1 return (distance, side) preferred = has_wan_candidates or candidates best = min(preferred, key=score) return amount_num_to_text(best["num"]) def split_segments(text: str) -> list[str]: parts = re.split(r"[,,;;。]", text) return [p.strip() for p in parts if p.strip()] def normalize_insurance_year(value: Any) -> str | None: if value is None: return None s = str(value).strip() if s in {"3", "3年", "3年交", "三年", "三年交"}: return "3" if s in {"5", "5年", "5年交", "五年", "五年交"}: return "5" return None def normalize_insurance_year_choices(value: Any) -> dict[str, str]: if not isinstance(value, dict): return {} out: dict[str, str] = {} for k, v in value.items(): key = str(k).strip() year = normalize_insurance_year(v) if key and year: out[key] = year return out def build_insurance_choice_key( *, source_line: str, raw_text: str, branch: str, amount: str, page: str, item_index: int, ) -> str: seed = f"{source_line}|{raw_text}|{branch}|{amount}|{page}|{item_index}" return hashlib.sha1(seed.encode("utf-8")).hexdigest()[:16] def pick_insurance_year_for_record( *, choice_key: str, source_line: str, raw_text: str, branch: str, amount: str, year_map: dict[str, str], default_year: str | None, ) -> str | None: candidates = [ choice_key, source_line, raw_text, f"{source_line}|{amount}", f"{raw_text}|{amount}", f"{branch}|{amount}", ] for key in candidates: if key and key in year_map: y = normalize_insurance_year(year_map.get(key)) if y in {"3", "5"}: return y return normalize_insurance_year(default_year) def make_product(type_keyword: str, page: str, amount: str | None, **kwargs: Any) -> dict[str, Any]: item = { "type": type_keyword, "page": page, "amount": amount, } item.update(kwargs) return item def detect_products( line: str, config: dict[str, Any], insurance_year_choice: str | None, ) -> list[dict[str, Any]]: insurance_cfg = config.get("insurance_handling", {}) insurance_page = str(insurance_cfg.get("page", "page_3")) if isinstance(insurance_cfg, dict) else "page_3" products: list[dict[str, Any]] = [] seen: set[str] = set() segments = split_segments(line) if not segments: segments = [line] line_candidates = extract_amount_candidates(line) seg_search_pos = 0 def locate_segment_start(seg_text: str) -> int: nonlocal seg_search_pos if not seg_text: return 0 pos = line.find(seg_text, seg_search_pos) if pos < 0: pos = line.find(seg_text) if pos >= 0: seg_search_pos = pos + len(seg_text) return pos return 0 def pick_segment_amount( seg_text: str, seg_candidates: list[dict[str, Any]], seg_anchor: int | None, seg_start: int, ) -> str | None: amount = pick_amount_from_candidates(seg_text, seg_candidates, seg_anchor) if amount: return amount if not line_candidates: return None line_anchor = None if isinstance(seg_anchor, int) and seg_anchor >= 0: line_anchor = seg_start + seg_anchor return pick_amount_from_candidates(line, line_candidates, line_anchor) for seg in segments: seg_candidates = extract_amount_candidates(seg) seg_start = locate_segment_start(seg) # 1) Insurance (always mapped to 期交 page) if "保险" in seg or "期交" in seg or re.search(r"(?:3|5|三|五)\s*年交", seg): year = None if re.search(r"(?:3|三)\s*年交", seg): year = "3" elif re.search(r"(?:5|五)\s*年交", seg): year = "5" elif insurance_year_choice in {"3", "5"}: year = insurance_year_choice insurance_anchor = seg.find("保险") if insurance_anchor < 0: insurance_anchor = 0 amount = pick_segment_amount(seg, seg_candidates, insurance_anchor, seg_start) if year == "3": type_keyword = "3年交" elif year == "5": type_keyword = "5年交" else: type_keyword = "保险" key = f"{type_keyword}|{insurance_page}|{amount}" if key not in seen: products.append( make_product( type_keyword, insurance_page, amount, is_insurance=True, needs_insurance_year=(year is None), ) ) seen.add(key) # 2) Wealth/Fund/Asset if "理财" in seg: amount = pick_segment_amount(seg, seg_candidates, seg.find("理财"), seg_start) key = f"理财|page_5|{amount}" if key not in seen: products.append(make_product("理财", "page_5", amount)) seen.add(key) if "基金" in seg: amount = pick_segment_amount(seg, seg_candidates, seg.find("基金"), seg_start) key = f"基金|page_7|{amount}" if key not in seen: products.append(make_product("基金", "page_7", amount)) seen.add(key) if "资管" in seg: amount = pick_segment_amount(seg, seg_candidates, seg.find("资管"), seg_start) key = f"资管|page_6|{amount}" if key not in seen: products.append(make_product("资管", "page_6", amount)) seen.add(key) # 3) Fixed-term deposit variants for term in TERM_DEFS: rgx = term["regex"] for m in rgx.finditer(seg): # Avoid interpreting "2年理财" as one-year term. if term["page"] == "page_2" and "理财" in seg and "定期" not in seg and "存" not in seg: continue amount = pick_segment_amount(seg, seg_candidates, m.start(), seg_start) key = f"{term['type']}|{term['page']}|{amount}" if key not in seen: products.append(make_product(str(term["type"]), str(term["page"]), amount)) seen.add(key) return products def is_demand_deposit_only_line(line: str) -> bool: # "活期"类关键词但没有明确期限(3/6个月、半年、1年、一年)时,默认不生成。 demand_markers = [ "微信提现", "微信支付宝提现", "支付宝提现", "挖转", "他行", "揽收现金", "存量提升", "揽收商户", "商户提现", "揽收彩礼", ] explicit_product_markers = [ "保险", "年交", "理财", "基金", "资管", "趸交", "三个月", "3个月", "六个月", "6个月", "半年", "一年", "1年", ] if not extract_amount_candidates(line): return False if not any(x in line for x in demand_markers): return False if any(x in line for x in explicit_product_markers): return False return True def key_for_record(record: dict[str, Any], key_fields: list[str]) -> str: line_serial = str(record.get("line_serial", "")).strip() if line_serial: line_product_index = str(record.get("line_product_index", "")).strip() or "1" sig = str(record.get("line_signature", "")).strip() if not sig: sig = line_signature(str(record.get("raw_text", ""))) parts = [f"line:{line_serial}", f"item:{line_product_index}"] if sig: parts.append(f"sig:{sig}") return "|".join(parts) return "|".join(str(record.get(k, "")).strip() for k in key_fields) def signature_key_for_record(record: dict[str, Any]) -> str: line_product_index = str(record.get("line_product_index", "")).strip() or "1" sig = str(record.get("line_signature", "")).strip() if not sig: raw_text = str(record.get("raw_text", "")).strip() if raw_text: sig = line_signature(raw_text) else: source_line = str(record.get("source_line", "")).strip() if source_line: normalized_source = normalize_line(source_line, r"^\s*\d+\s*(?:[、,,..\))]\s*)?") sig = line_signature(normalized_source) if not sig: return "" return f"sig:{sig}|item:{line_product_index}" def render_output_filename(config: dict[str, Any], record: dict[str, Any], index: int) -> str: settings = config.get("output_settings", {}) pattern = "喜报_{branch}_{index}.png" if isinstance(settings, dict): pattern = str(settings.get("output_pattern", pattern)) try: name = pattern.format(branch=record.get("branch", "未知网点"), index=index) except Exception: name = f"喜报_{record.get('branch', '未知网点')}_{index}.png" if not name.lower().endswith(".png"): name = f"{name}.png" return name def normalize_branch_value(branch: Any, config: dict[str, Any]) -> str: b = str(branch).strip() alias = config.get("branches", {}).get("alias", {}) if isinstance(alias, dict) and b in alias: return str(alias.get(b, b)).strip() return b def infer_page_from_type(type_keyword: str, config: dict[str, Any]) -> str: t = str(type_keyword).strip() if not t: return "" type_map = config.get("type_matching", {}) if isinstance(type_map, dict): direct = str(type_map.get(t, "")).strip() if direct: return direct pairs = [(str(k), str(v)) for k, v in type_map.items()] pairs.sort(key=lambda x: len(x[0]), reverse=True) for k, v in pairs: if k and k in t and v: return v.strip() for term in TERM_DEFS: if t == str(term.get("type", "")): return str(term.get("page", "")).strip() return "" def apply_record_overrides( record: dict[str, Any], updates: dict[str, Any], config: dict[str, Any], ) -> dict[str, Any]: out = dict(record) if not isinstance(updates, dict): return out if "branch" in updates: branch = normalize_branch_value(updates.get("branch", ""), config) if branch: out["branch"] = branch if "amount" in updates: amount = normalize_amount_text(updates.get("amount", "")) if amount: out["amount"] = amount if "type" in updates: t = str(updates.get("type", "")).strip() if t: out["type"] = t if "page" not in updates: page = infer_page_from_type(t, config) if page: out["page"] = page if "page" in updates: page = str(updates.get("page", "")).strip() if page: out["page"] = page if "status" in updates: status = str(updates.get("status", "")).strip() if status: out["status"] = normalize_status_value(status, config) return out def validate_record_for_generation(record: dict[str, Any], config: dict[str, Any]) -> None: branch = str(record.get("branch", "")).strip() amount = normalize_amount_text(record.get("amount", "")) type_keyword = str(record.get("type", "")).strip() page = str(record.get("page", "")).strip() status = str(record.get("status", "")).strip() if not branch: raise ValueError("branch is required") allowed = config.get("branches", {}).get("allowed", []) if isinstance(allowed, list) and allowed: allow_set = {str(x).strip() for x in allowed} if branch not in allow_set: raise ValueError(f"branch not allowed: {branch}") if not amount: raise ValueError("amount is required") record["amount"] = amount if not type_keyword: raise ValueError("type is required") if not page: page = infer_page_from_type(type_keyword, config) if page: record["page"] = page page = str(record.get("page", "")).strip() if not page: raise ValueError("page is required") pages = config.get("pages", {}) if isinstance(pages, dict) and page not in pages: raise ValueError(f"invalid page: {page}") if not status: status = str(config.get("status_extraction", {}).get("fallback", "成功营销")).strip() or "成功营销" valid_status = get_valid_status_list(config) if valid_status and status not in valid_status: raise ValueError(f"invalid status: {status}") record["status"] = normalize_status_value(status, config) def match_manual_rule(rule: dict[str, Any], source_line: str, normalized_line: str) -> bool: if not isinstance(rule, dict) or not bool(rule.get("enabled", True)): return False keyword = str(rule.get("keyword", "")).strip() if not keyword: return False mode = str(rule.get("match_mode", "normalized")).strip().lower() if mode == "source": return keyword in source_line if mode == "both": return keyword in source_line or keyword in normalized_line return keyword in normalized_line def apply_manual_rules_to_record( record: dict[str, Any], *, source_line: str, normalized_line: str, rules: list[dict[str, Any]], config: dict[str, Any], ) -> tuple[dict[str, Any], list[str]]: if not rules: return record, [] out = dict(record) hit_ids: list[str] = [] for rule in rules: if not match_manual_rule(rule, source_line, normalized_line): continue updates = rule.get("updates", {}) if not isinstance(updates, dict): continue try: out = apply_record_overrides(out, updates, config) except Exception: continue hit_ids.append(str(rule.get("id", ""))) return out, [x for x in hit_ids if x] def save_or_update_manual_rule( *, keyword: str, updates: dict[str, Any], note: str = "", match_mode: str = "normalized", ) -> dict[str, Any]: kw = re.sub(r"\s+", "", str(keyword).strip()) if not kw: raise ValueError("rule keyword is required") if not isinstance(updates, dict) or not updates: raise ValueError("rule updates is required") normalized_updates: dict[str, Any] = {} for k in ("branch", "type", "page", "status", "amount"): if k in updates: v = str(updates.get(k, "")).strip() if v: normalized_updates[k] = v if not normalized_updates: raise ValueError("rule updates is empty") mode = str(match_mode or "normalized").strip().lower() if mode not in {"source", "normalized", "both"}: mode = "normalized" rules = load_manual_rules() now = datetime.now().isoformat(timespec="seconds") for rule in rules: if not isinstance(rule, dict): continue if str(rule.get("keyword", "")).strip() != kw: continue if str(rule.get("match_mode", "normalized")).strip().lower() != mode: continue if rule.get("updates") != normalized_updates: continue rule["updated_at"] = now if note: rule["note"] = note save_manual_rules(rules) return dict(rule) item = { "id": uuid.uuid4().hex[:12], "keyword": kw, "match_mode": mode, "updates": normalized_updates, "enabled": True, "created_at": now, "updated_at": now, } if note: item["note"] = note rules.append(item) save_manual_rules(rules) return dict(item) def infer_correction_rule_keyword( *, source_line: str, normalized_line: str, corrected_record: dict[str, Any], ) -> str: status = str(corrected_record.get("status", "")).strip() if status and (status in normalized_line or status in source_line): return status t = str(corrected_record.get("type", "")).strip() if t and (t in normalized_line or t in source_line) and len(t) <= 8: return t if normalized_line: return normalized_line[:24] return re.sub(r"\s+", "", source_line)[:24] def parse_records( raw_text: str, config: dict[str, Any], history: list[dict[str, Any]], insurance_year_choice: str | None = None, insurance_year_choices: dict[str, str] | None = None, ) -> dict[str, Any]: relay_cfg = config.get("relay_handling", {}) parse_rules = relay_cfg.get("parse_rules", {}) if isinstance(relay_cfg, dict) else {} trigger = str(relay_cfg.get("trigger_keyword", "#接龙")) line_pattern = str(parse_rules.get("line_pattern", r"^\d+、\s*")) skip_lines = parse_rules.get("skip_lines", []) skip_lines = [str(x) for x in skip_lines] if isinstance(skip_lines, list) else [] has_trigger = trigger in raw_text lines = [line.strip() for line in raw_text.splitlines() if line.strip()] filtered_lines: list[dict[str, str]] = [] skipped: list[dict[str, str]] = [] for line in lines: source_line = line.strip() if trigger and trigger in source_line and source_line == trigger: continue normalized = normalize_line(source_line, line_pattern) if not normalized: continue if is_date_header_line(normalized): skipped.append({"line": source_line, "reason": "skip_line_rule"}) continue if any(token and token in normalized for token in skip_lines): skipped.append({"line": source_line, "reason": "skip_line_rule"}) continue filtered_lines.append( { "source_line": source_line, "normalized_line": normalized, "line_serial": extract_line_serial(source_line, line_pattern), } ) if not filtered_lines and raw_text.strip(): source_line = raw_text.strip() normalized_single = normalize_line(source_line, line_pattern) if normalized_single: filtered_lines = [ { "source_line": source_line, "normalized_line": normalized_single, "line_serial": extract_line_serial(source_line, line_pattern), } ] branches_cfg = config.get("branches", {}) allowed_branches = set(str(x) for x in branches_cfg.get("allowed", [])) if isinstance(branches_cfg, dict) else set() dedup_cfg = relay_cfg.get("dedup", {}) if isinstance(relay_cfg, dict) else {} key_fields = dedup_cfg.get("key_fields", ["branch", "amount", "type"]) if not isinstance(key_fields, list) or not key_fields: key_fields = ["branch", "amount", "type"] key_fields = [str(k) for k in key_fields] insurance_year_map = dict(insurance_year_choices) if isinstance(insurance_year_choices, dict) else {} history_lookup = build_history_lookup(history, key_fields) history_keys = set(history_lookup.keys()) history_signature_lookup = build_history_signature_lookup(history) history_signature_keys = set(history_signature_lookup.keys()) batch_keys: set[str] = set() batch_signature_keys: set[str] = set() suppressed_lookup = build_skip_suppression_lookup(list_skip_suppressions()) manual_rules = load_manual_rules() records: list[dict[str, Any]] = [] new_records: list[dict[str, Any]] = [] pending_insurance_records: list[dict[str, Any]] = [] duplicate_records: list[dict[str, Any]] = [] for line_item in filtered_lines: source_line = str(line_item.get("source_line", "")) normalized_line = str(line_item.get("normalized_line", "")) line_serial = str(line_item.get("line_serial", "")).strip() normalized_signature = line_signature(normalized_line) branch_raw, branch = extract_branch(normalized_line, config) if not branch: skipped.append({"line": source_line, "reason": "branch_not_found"}) continue if allowed_branches and branch not in allowed_branches: skipped.append({"line": source_line, "reason": f"branch_not_allowed:{branch}"}) continue status = normalize_status_value(extract_status(normalized_line, config), config) products = detect_products(normalized_line, config, insurance_year_choice) if not products: if is_demand_deposit_only_line(normalized_line): skipped.append({"line": source_line, "reason": "demand_deposit_not_generate"}) else: skipped.append({"line": source_line, "reason": "type_not_found"}) continue for product_index, product in enumerate(products, start=1): amount = str(product.get("amount", "")).strip() if not amount: skipped.append({"line": source_line, "reason": "amount_not_found"}) continue type_keyword = str(product.get("type", "")).strip() page = str(product.get("page", "")).strip() if not type_keyword or not page: skipped.append({"line": source_line, "reason": "type_not_found"}) continue record = { "source_line": source_line, "raw_text": normalized_line, "branch_raw": branch_raw, "branch": branch, "amount": amount, "type": type_keyword, "page": page, "status": status, "needs_insurance_year": bool(product.get("needs_insurance_year", False)), } if line_serial: record["line_serial"] = line_serial if normalized_signature: record["line_signature"] = normalized_signature record["line_product_index"] = str(product_index) if record["needs_insurance_year"]: choice_key = build_insurance_choice_key( source_line=source_line, raw_text=normalized_line, branch=branch, amount=amount, page=page, item_index=product_index, ) record["insurance_choice_key"] = choice_key selected_year = pick_insurance_year_for_record( choice_key=choice_key, source_line=source_line, raw_text=normalized_line, branch=branch, amount=amount, year_map=insurance_year_map, default_year=insurance_year_choice, ) if selected_year in {"3", "5"}: record["type"] = f"{selected_year}年交" record["needs_insurance_year"] = False record["insurance_year"] = selected_year record, hit_rule_ids = apply_manual_rules_to_record( record, source_line=source_line, normalized_line=normalized_line, rules=manual_rules, config=config, ) record["amount"] = normalize_amount_text(record.get("amount", "")) record["status"] = normalize_status_value(str(record.get("status", "")), config) if hit_rule_ids: record["applied_rule_ids"] = hit_rule_ids if record["needs_insurance_year"]: forced_year = normalize_insurance_year(record.get("type")) if forced_year in {"3", "5"}: record["type"] = f"{forced_year}年交" record["needs_insurance_year"] = False record["insurance_year"] = forced_year if not str(record.get("amount", "")).strip(): skipped.append({"line": source_line, "reason": "amount_not_found"}) continue if not str(record.get("page", "")).strip(): inferred_page = infer_page_from_type(str(record.get("type", "")), config) if inferred_page: record["page"] = inferred_page pages = config.get("pages", {}) if isinstance(pages, dict) and str(record.get("page", "")).strip() not in pages: skipped.append({"line": source_line, "reason": "type_not_found"}) continue if allowed_branches and str(record.get("branch", "")).strip() not in allowed_branches: skipped.append({"line": source_line, "reason": f"branch_not_allowed:{record.get('branch', '')}"}) continue records.append(record) dedup_key = key_for_record(record, key_fields) record["dedup_key"] = dedup_key signature_key = signature_key_for_record(record) if signature_key: record["signature_key"] = signature_key history_record = history_lookup.get(dedup_key) if history_record is None and signature_key: history_record = history_signature_lookup.get(signature_key) if history_record is not None or dedup_key in history_keys or (signature_key and signature_key in history_signature_keys): dup = dict(record) dup["duplicate_reason"] = "history_duplicate" attach_duplicate_download_info(dup, history_record, config) duplicate_records.append(dup) continue if dedup_key in batch_keys or (signature_key and signature_key in batch_signature_keys): dup = dict(record) dup["duplicate_reason"] = "input_duplicate" duplicate_records.append(dup) continue # Insurance without explicit 3/5 year should ask user only when truly pending. # If this line was manually suppressed in skip rules, keep it out of prompt queue. if record["needs_insurance_year"]: if is_skip_item_suppressed(source_line, "insurance_year_pending", suppressed_lookup): batch_keys.add(dedup_key) if signature_key: batch_signature_keys.add(signature_key) continue pending = dict(record) pending["insurance_options"] = ["3", "5"] pending_insurance_records.append(pending) batch_keys.add(dedup_key) if signature_key: batch_signature_keys.add(signature_key) continue batch_keys.add(dedup_key) if signature_key: batch_signature_keys.add(signature_key) new_records.append(record) for i, rec in enumerate(new_records, start=1): rec["output_file"] = render_output_filename(config, rec, i) visible_skipped: list[dict[str, str]] = [] for item in skipped: line_text = str(item.get("line", "")) reason_text = str(item.get("reason", "")) if is_skip_item_suppressed(line_text, reason_text, suppressed_lookup): continue visible_skipped.append(item) summary = { "input_lines": len(lines), "parsed": len(records), "new": len(new_records), "duplicate": len(duplicate_records), "skipped": len(visible_skipped), "insurance_pending": len(pending_insurance_records), } return { "has_trigger": has_trigger, "records": records, "new_records": new_records, "pending_insurance_records": pending_insurance_records, "duplicate_records": duplicate_records, "skipped": visible_skipped, "summary": summary, "dedup_key_fields": key_fields, "needs_insurance_choice": len(pending_insurance_records) > 0, } def append_new_history( history_path: Path, current_history: list[dict[str, Any]], records: list[dict[str, Any]], key_fields: list[str], ) -> dict[str, Any]: existing = {key_for_record(r, key_fields) for r in current_history} to_add = [] now = datetime.now().isoformat(timespec="seconds") for rec in records: if not isinstance(rec, dict): continue k = key_for_record(rec, key_fields) if not k or k in existing: continue item = dict(rec) item.pop("download_token", None) item.pop("download_url", None) item["created_at"] = now to_add.append(item) existing.add(k) merged = current_history + to_add save_history(history_path, merged) return { "added": len(to_add), "total": len(merged), } def upsert_history_records( history_path: Path, current_history: list[dict[str, Any]], records: list[dict[str, Any]], key_fields: list[str], ) -> dict[str, Any]: merged = [x for x in current_history if isinstance(x, dict)] index_map: dict[str, int] = {} for i, item in enumerate(merged): k = key_for_record(item, key_fields) if k: index_map[k] = i now = datetime.now().isoformat(timespec="seconds") added = 0 replaced = 0 for rec in records: if not isinstance(rec, dict): continue item = dict(rec) item.pop("download_token", None) item.pop("download_url", None) item["created_at"] = now k = key_for_record(item, key_fields) if not k: continue if k in index_map: merged[index_map[k]] = item replaced += 1 else: index_map[k] = len(merged) merged.append(item) added += 1 save_history(history_path, merged) return { "added": added, "replaced": replaced, "total": len(merged), } def page_to_slide_index(page_name: str) -> int: m = re.match(r"^page_(\d+)$", page_name) if not m: raise ValueError(f"invalid page name: {page_name}") return int(m.group(1)) def safe_filename(name: str) -> str: s = re.sub(r"[\\/:*?\"<>|\r\n]+", "_", name).strip() s = re.sub(r"\s+", "_", s) return s or f"file_{uuid.uuid4().hex[:8]}" def amount_to_digits(amount: str) -> str: v = floor_amount_number(amount) if v is None: t = str(amount).strip() return t.replace("万元", "").replace("万", "") return str(v) def pick_paragraph_index(paragraph_count: int, configured_index: int) -> int: if paragraph_count <= 0: return 0 if 0 <= configured_index < paragraph_count: return configured_index # Compatible with 1-based config values. one_based = configured_index - 1 if 0 <= one_based < paragraph_count: return one_based return 0 def paragraph_plain_text(para: Any) -> str: if getattr(para, "runs", None): return "".join((run.text or "") for run in para.runs) return str(getattr(para, "text", "") or "") def set_paragraph_text(para: Any, text: str) -> None: if getattr(para, "runs", None): para.runs[0].text = text for run in para.runs[1:]: run.text = "" return para.text = text def paragraph_has_branch_marker(para: Any) -> bool: txt = paragraph_plain_text(para).strip() return bool(txt) and ("营业所" in txt) def replace_company_placeholder_text(text: str, company_replacement: str) -> str: company_text = str(company_replacement or "").strip() if not company_text: return str(text or "") return re.sub(r"([XxXx**]+)\s*(?=分公司)", company_text, str(text or ""), count=1) def build_branch_text( original_text: str, branch: str, fallback_text: str, company_replacement: str = "", ) -> str: text = str(original_text or "") if not text.strip(): return fallback_text # Some templates use placeholder tokens like "XX/**" before "分公司". # Replace them first to avoid rendering "XX分公司". text = replace_company_placeholder_text(text, company_replacement) # Keep the template prefix (e.g. "永州市道县分公司"), only replace the branch part. m = re.search(r"(分公司)([^|,。,;;\s]*)营业所", text) if m: return f"{text[:m.start(2)]}{branch}营业所{text[m.end():]}" m = re.search(r"([^|,。,;;\s]*)营业所", text) if m: return f"{text[:m.start(1)]}{branch}营业所{text[m.end():]}" return fallback_text def replace_branch_in_slide(slide: Any, branch: str, config: dict[str, Any]) -> None: branch_cfg = config.get("replace_algorithm", {}).get("branch", {}) location_cfg = config.get("branches", {}).get("template_location", {}) shape_index = int(branch_cfg.get("shape_index", location_cfg.get("shape_index", 3))) paragraph_index = int(branch_cfg.get("paragraph_index", location_cfg.get("paragraph_index", 1))) fmt = str(branch_cfg.get("format", location_cfg.get("display_format", "{branch}营业所"))) company_replacement = str( branch_cfg.get("company_replacement", location_cfg.get("company_replacement", "")) ).strip() new_text = fmt.format(branch=branch) target_para = None fallback_para = None target_shape = None cfg_paragraphs = [] if 0 <= shape_index < len(slide.shapes): cfg_shape = slide.shapes[shape_index] if getattr(cfg_shape, "has_text_frame", False): cfg_paragraphs = cfg_shape.text_frame.paragraphs if cfg_paragraphs: p_index = pick_paragraph_index(len(cfg_paragraphs), paragraph_index) fallback_para = cfg_paragraphs[p_index] if paragraph_has_branch_marker(cfg_paragraphs[p_index]): target_para = cfg_paragraphs[p_index] target_shape = cfg_shape else: for para in cfg_paragraphs: if paragraph_has_branch_marker(para): target_para = para target_shape = cfg_shape break if target_para is None: for shape in slide.shapes: if not getattr(shape, "has_text_frame", False): continue paragraphs = shape.text_frame.paragraphs for para in paragraphs: if paragraph_has_branch_marker(para): target_para = para target_shape = shape break if target_para is not None: break if target_para is not None: original_text = paragraph_plain_text(target_para) set_paragraph_text( target_para, build_branch_text(original_text, branch, new_text, company_replacement), ) if company_replacement and target_shape is not None and getattr(target_shape, "has_text_frame", False): for para in target_shape.text_frame.paragraphs: old_text = paragraph_plain_text(para) new_company_text = replace_company_placeholder_text(old_text, company_replacement) if new_company_text != old_text: set_paragraph_text(para, new_company_text) return if fallback_para is not None: set_paragraph_text(fallback_para, new_text) def replace_amount_in_slide(slide: Any, shape_index: int, amount: str) -> None: if shape_index < 0 or shape_index >= len(slide.shapes): return shape = slide.shapes[shape_index] if not getattr(shape, "has_text_frame", False): return amount_digits = amount_to_digits(amount) candidates: list[tuple[Any, int, str]] = [] paragraphs = shape.text_frame.paragraphs for para in paragraphs: runs = list(getattr(para, "runs", []) or []) for idx, run in enumerate(runs): t = str(getattr(run, "text", "") or "").strip() if re.fullmatch(r"\d+(?:\.\d+)?", t) or re.fullmatch(r"[\**]+", t): candidates.append((para, idx, t)) if not candidates: return def next_nonempty_text(para: Any, start_idx: int) -> str: runs = list(getattr(para, "runs", []) or []) for j in range(start_idx + 1, len(runs)): txt = str(getattr(runs[j], "text", "") or "").strip() if txt: return txt return "" def prev_nonempty_text(para: Any, start_idx: int) -> str: runs = list(getattr(para, "runs", []) or []) for j in range(start_idx - 1, -1, -1): txt = str(getattr(runs[j], "text", "") or "").strip() if txt: return txt return "" preferred: list[tuple[Any, int, str]] = [] for para, idx, _ in candidates: nxt = next_nonempty_text(para, idx) prv = prev_nonempty_text(para, idx) # Prefer the amount token that is adjacent to "万/万元". if ("万" in nxt) or ("万元" in nxt) or ("万" in prv) or ("万元" in prv): preferred.append((para, idx, "")) targets = preferred if preferred else [candidates[-1]] for para, idx, _ in targets: runs = list(getattr(para, "runs", []) or []) if 0 <= idx < len(runs): runs[idx].text = amount_digits def replace_status_in_slide( slide: Any, shape_index: int, status: str, page_name: str, config: dict[str, Any], ) -> None: if shape_index < 0 or shape_index >= len(slide.shapes): return shape = slide.shapes[shape_index] if not getattr(shape, "has_text_frame", False): return normalized_status = normalize_status_value(status, config) page_cfg = config.get("pages", {}).get(page_name, {}) has_bracket = bool(page_cfg.get("has_bracket", False)) if isinstance(page_cfg, dict) else False target_text = f"({normalized_status})" if has_bracket else normalized_status valid_status = get_valid_status_list(config) paragraphs = shape.text_frame.paragraphs if not paragraphs: shape.text_frame.text = target_text return target_para = None for para in paragraphs: para_text = "".join(run.text for run in para.runs).strip() if para.runs else para.text.strip() if not para_text: continue if any(s in para_text for s in valid_status) or ("成功营销" in para_text) or ("(" in para_text and ")" in para_text): target_para = para break if target_para is None: for para in paragraphs: para_text = "".join(run.text for run in para.runs).strip() if para.runs else para.text.strip() if para_text: target_para = para break if target_para is None: target_para = paragraphs[0] if target_para.runs: replaced = False # 优先替换已存在的状态 run,保持模板字体/字号完全不变。 for run in target_para.runs: raw = run.text or "" trimmed = raw.strip() if not raw: continue if trimmed in valid_status or trimmed == "成功营销": run.text = raw.replace(trimmed, normalized_status, 1) replaced = True break if not replaced and has_bracket: for run in target_para.runs: raw = run.text or "" if "(" in raw and ")" in raw: run.text = re.sub(r"([^)]*)", f"({normalized_status})", raw, count=1) replaced = True break if not replaced: target_para.runs[0].text = target_text for run in target_para.runs[1:]: run.text = "" else: target_para.text = target_text if has_bracket: target_para.alignment = PP_ALIGN.CENTER shape.text_frame.word_wrap = False def replace_page_display_name_in_slide(slide: Any, page_name: str, config: dict[str, Any]) -> None: pages = config.get("pages", {}) if not isinstance(pages, dict): return page_cfg = pages.get(page_name, {}) if not isinstance(page_cfg, dict): return display_name = str(page_cfg.get("display_name", "")).strip() if not display_name: return aliases = page_cfg.get("display_aliases", []) alias_list: list[str] = [display_name] if isinstance(aliases, list): alias_list.extend([str(x).strip() for x in aliases if str(x).strip()]) if page_name == "page_5": # 兼容旧模板文案。 alias_list.extend(["两三期理财", "两三年期理财"]) alias_list = [x for x in dict.fromkeys(alias_list) if x] for shape in slide.shapes: if not getattr(shape, "has_text_frame", False): continue paragraphs = shape.text_frame.paragraphs for para in paragraphs: raw_text = paragraph_plain_text(para) compact_text = normalize_skip_line(raw_text) if not compact_text: continue for alias in alias_list: alias_compact = normalize_skip_line(alias) if not alias_compact or alias_compact not in compact_text: continue if compact_text == alias_compact: set_paragraph_text(para, display_name) elif alias in raw_text: set_paragraph_text(para, raw_text.replace(alias, display_name, 1)) elif len(compact_text) <= 24: set_paragraph_text(para, compact_text.replace(alias_compact, display_name, 1)) else: continue # 避免“多一个字后自动折行”。 shape.text_frame.word_wrap = False break def keep_only_target_slide(prs: Presentation, keep_index: int) -> bool: try: sld_id_list = prs.slides._sldIdLst # type: ignore[attr-defined] total = len(prs.slides) if keep_index < 0 or keep_index >= total: return False for idx in range(total - 1, -1, -1): if idx == keep_index: continue rel_id = sld_id_list[idx].rId prs.part.drop_rel(rel_id) del sld_id_list[idx] return True except Exception: return False def is_single_slide_output_enabled(config: dict[str, Any]) -> bool: perf_cfg = config.get("performance", {}) if isinstance(perf_cfg, dict): return bool(perf_cfg.get("single_slide_output", True)) return True def resolve_generation_strategy(config: dict[str, Any], total_records: int) -> str: perf_cfg = config.get("performance", {}) if not isinstance(perf_cfg, dict): return "legacy" strategy = str(perf_cfg.get("generation_strategy", "page_template_cache")).strip().lower() if strategy not in {"legacy", "page_template_cache"}: strategy = "page_template_cache" if strategy == "legacy": return "legacy" min_records = perf_cfg.get("template_cache_min_records", 2) try: min_records_int = int(min_records) except Exception: min_records_int = 2 if total_records < max(1, min_records_int): return "legacy" if not is_single_slide_output_enabled(config): return "legacy" return "page_template_cache" def _resolve_perf_section(config: dict[str, Any]) -> dict[str, Any]: perf_cfg = config.get("performance", {}) return perf_cfg if isinstance(perf_cfg, dict) else {} def resolve_single_generation_mode(config: dict[str, Any]) -> bool: perf_cfg = _resolve_perf_section(config) return bool(perf_cfg.get("single_generation_mode", True)) def resolve_build_workers(config: dict[str, Any], total_records: int) -> int: perf_cfg = _resolve_perf_section(config) raw_workers = perf_cfg.get("max_build_workers", 1) try: workers = int(raw_workers) except Exception: workers = 1 workers = max(1, min(6, workers)) return max(1, min(workers, total_records, (os.cpu_count() or 2))) def resolve_extract_workers(config: dict[str, Any]) -> int: perf_cfg = _resolve_perf_section(config) raw_workers = perf_cfg.get("max_extract_workers", 1) try: workers = int(raw_workers) except Exception: workers = 1 return max(1, min(6, workers, (os.cpu_count() or 2))) def resolve_memory_reclaim_options(config: dict[str, Any]) -> dict[str, bool]: perf_cfg = _resolve_perf_section(config) reclaim_cfg = perf_cfg.get("memory_reclaim", {}) if not isinstance(reclaim_cfg, dict): reclaim_cfg = {} enabled = bool(reclaim_cfg.get("enabled", True)) return { "enabled": enabled, "gc_collect": enabled and bool(reclaim_cfg.get("gc_collect", True)), "malloc_trim": enabled and bool(reclaim_cfg.get("malloc_trim", True)), } def _try_malloc_trim() -> bool: global _MALLOC_TRIM_FN if os.name != "posix": return False try: if _MALLOC_TRIM_FN is None: libc = ctypes.CDLL("libc.so.6") fn = libc.malloc_trim fn.argtypes = [ctypes.c_size_t] fn.restype = ctypes.c_int _MALLOC_TRIM_FN = fn return bool(_MALLOC_TRIM_FN(0)) except Exception: return False def reclaim_runtime_memory(config: dict[str, Any]) -> None: opts = resolve_memory_reclaim_options(config) if not opts["enabled"]: return if opts["gc_collect"]: gc.collect() if opts["malloc_trim"]: _try_malloc_trim() def build_page_template_cache( template_path: Path, page_names: set[str], ) -> dict[str, bytes]: cache: dict[str, bytes] = {} for page_name in sorted(page_names): prs = Presentation(str(template_path)) slide_index = page_to_slide_index(page_name) if slide_index < 0 or slide_index >= len(prs.slides): raise RuntimeError(f"slide index out of range for page={page_name}") if not keep_only_target_slide(prs, slide_index): raise RuntimeError(f"failed to prepare single-slide template for page={page_name}") buffer = io.BytesIO() prs.save(buffer) cache[page_name] = buffer.getvalue() return cache def resolve_image_delivery_options(config: dict[str, Any]) -> dict[str, Any]: perf_cfg = config.get("performance", {}) delivery_cfg = perf_cfg.get("image_delivery", {}) if isinstance(perf_cfg, dict) else {} if not isinstance(delivery_cfg, dict): delivery_cfg = {} enabled = bool(delivery_cfg.get("enabled", True)) raw_max_kbps = delivery_cfg.get("max_kbps", 300) try: max_kbps = int(raw_max_kbps) except Exception: max_kbps = 300 max_kbps = max(0, max_kbps) raw_chunk_kb = delivery_cfg.get("chunk_kb", 16) try: chunk_kb = int(raw_chunk_kb) except Exception: chunk_kb = 16 chunk_kb = max(4, min(256, chunk_kb)) if not enabled: max_kbps = 0 return { "enabled": enabled, "max_kbps": max_kbps, "chunk_size": chunk_kb * 1024, } def build_ppt_for_record( template_path: Path, config: dict[str, Any], record: dict[str, Any], out_pptx: Path, page_template_cache: dict[str, bytes] | None = None, ) -> int: page_name = str(record.get("page", "")) cached_payload = page_template_cache.get(page_name) if isinstance(page_template_cache, dict) else None if cached_payload: prs = Presentation(io.BytesIO(cached_payload)) slide_index = 0 else: prs = Presentation(str(template_path)) slide_index = page_to_slide_index(page_name) if slide_index < 0 or slide_index >= len(prs.slides): raise RuntimeError(f"slide index out of range for page={record.get('page')}") slide = prs.slides[slide_index] page_cfg = config.get("pages", {}).get(page_name, {}) amount_shape = int(page_cfg.get("amount_shape", 2)) status_shape = int(page_cfg.get("status_shape", 3)) normalized_status = normalize_status_value(str(record.get("status", "")), config) replace_branch_in_slide(slide, str(record.get("branch", "")), config) replace_amount_in_slide(slide, amount_shape, str(record.get("amount", ""))) replace_status_in_slide(slide, status_shape, normalized_status, page_name, config) replace_page_display_name_in_slide(slide, page_name, config) single_slide_output = is_single_slide_output_enabled(config) export_page_number = slide_index + 1 if not cached_payload and single_slide_output and keep_only_target_slide(prs, slide_index): export_page_number = 1 prs.save(str(out_pptx)) return export_page_number def run_subprocess(args: list[str], timeout: int = 300) -> subprocess.CompletedProcess[str]: return subprocess.run( args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, check=False, timeout=timeout, ) def use_local_office_converter() -> bool: return bool(LOCAL_LIBREOFFICE_BIN and Path(str(LOCAL_LIBREOFFICE_BIN)).exists()) def use_local_pdftoppm() -> bool: return bool(LOCAL_PDFTOPPM_BIN and Path(str(LOCAL_PDFTOPPM_BIN)).exists()) def ensure_converter_image() -> None: global CONVERTER_IMAGE_READY if use_local_office_converter(): CONVERTER_IMAGE_READY = True return if CONVERTER_IMAGE_READY: return with _CONVERTER_LOCK: if CONVERTER_IMAGE_READY: return inspect = run_subprocess(["docker", "image", "inspect", CONVERTER_IMAGE], timeout=60) if inspect.returncode != 0: pull = run_subprocess(["docker", "pull", CONVERTER_IMAGE], timeout=900) if pull.returncode != 0: raise RuntimeError( f"failed to pull docker image {CONVERTER_IMAGE}: {pull.stderr.strip() or pull.stdout.strip()}" ) CONVERTER_IMAGE_READY = True def prewarm_converter_image(background: bool = True) -> None: def _task() -> None: try: ensure_converter_image() print(f"Converter image ready: {CONVERTER_IMAGE}") except Exception as exc: print(f"Converter image prewarm failed: {exc}") if background: t = threading.Thread(target=_task, name="converter-prewarm", daemon=True) t.start() else: _task() def convert_pptx_to_pdf_batch( job_dir: Path, pptx_paths: list[Path], progress_cb: Callable[[int, str, str], None] | None = None, ) -> list[Path]: if not pptx_paths: return [] pdf_paths = [p.with_suffix(".pdf") for p in pptx_paths] total = len(pdf_paths) start_ts = time.time() last_percent = -1 last_done = -1 if use_local_office_converter(): args = [ str(LOCAL_LIBREOFFICE_BIN), "--headless", "--convert-to", "pdf", "--outdir", str(job_dir), ] args.extend([str(p) for p in pptx_paths]) else: ensure_converter_image() uid_gid = f"{os.getuid()}:{os.getgid()}" workdir = str(job_dir) args = [ "docker", "run", "--rm", "--user", uid_gid, "-v", f"{workdir}:/work", CONVERTER_IMAGE, "libreoffice", "--headless", "--convert-to", "pdf", "--outdir", "/work", ] args.extend([f"/work/{p.name}" for p in pptx_paths]) proc = subprocess.Popen( args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, ) while True: ret = proc.poll() done = 0 for p in pdf_paths: if p.exists(): done += 1 # 70~84 之间滚动,避免长时间停在 70% 不动。 percent_by_done = int((done / max(total, 1)) * 14) percent_by_time = min(13, int((time.time() - start_ts) / 2.0)) percent = min(84, 70 + max(percent_by_done, percent_by_time)) if progress_cb and (done != last_done or percent != last_percent): progress_cb(percent, "PPT转PDF", f"已转 {done}/{total}") last_done = done last_percent = percent if ret is not None: break time.sleep(0.35) stdout, stderr = proc.communicate() if proc.returncode != 0: raise RuntimeError(f"pptx->pdf batch convert failed: {stderr.strip() or stdout.strip()}") if progress_cb: progress_cb(84, "PPT转PDF", f"已转 {total}/{total}") missing = [str(p.name) for p in pdf_paths if not p.exists()] if missing: raise RuntimeError(f"pptx->pdf batch convert failed, missing outputs: {', '.join(missing[:5])}") return pdf_paths def extract_pdf_pages_to_png_batch( tasks: list[dict[str, Any]], config: dict[str, Any] | None = None, progress_cb: Callable[[int, str, str], None] | None = None, ) -> None: if not tasks: return first_pdf = Path(str(tasks[0]["pdf_path"])) job_dir = first_pdf.parent local_pdftoppm = use_local_pdftoppm() map_file = job_dir / "_extract_map.nul" with map_file.open("wb") as f: for t in tasks: if local_pdftoppm: pdf_field = str(Path(str(t["pdf_path"])).resolve()) out_field = str(Path(str(t["png_path"])).with_suffix("").resolve()) else: pdf_field = Path(str(t["pdf_path"])).name out_field = Path(str(t["png_path"])).with_suffix("").name pdf_name = pdf_field.encode("utf-8") page_number = str(int(t["page_number"])).encode("utf-8") out_prefix = out_field.encode("utf-8") f.write(pdf_name + b"\0" + page_number + b"\0" + out_prefix + b"\0") parallel_jobs = resolve_extract_workers(config or {}) if local_pdftoppm: script = ( "set -e; " f"xargs -0 -P {parallel_jobs} -n 3 sh -c " "'pdf=\"$1\"; page=\"$2\"; out=\"$3\"; " f"\"{LOCAL_PDFTOPPM_BIN}\" -f \"$page\" -singlefile -png \"$pdf\" \"$out\"' _ " f"< {str(map_file)}" ) args = ["sh", "-lc", script] else: ensure_converter_image() uid_gid = f"{os.getuid()}:{os.getgid()}" script = ( "set -e; " f"xargs -0 -P {parallel_jobs} -n 3 sh -c " "'pdf=\"$1\"; page=\"$2\"; out=\"$3\"; " "pdftoppm -f \"$page\" -singlefile -png \"/work/$pdf\" \"/work/$out\"' _ " "< /work/_extract_map.nul" ) args = [ "docker", "run", "--rm", "--user", uid_gid, "-v", f"{str(job_dir)}:/work", CONVERTER_IMAGE, "sh", "-lc", script, ] proc = subprocess.Popen( args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, ) total = len(tasks) last_done = -1 last_percent = -1 while True: ret = proc.poll() done = 0 for t in tasks: if Path(str(t["png_path"])).exists(): done += 1 percent = min(94, 85 + int((done / max(total, 1)) * 9)) if progress_cb and (done != last_done or percent != last_percent): progress_cb(percent, "PDF转PNG", f"已导出 {done}/{total}") last_done = done last_percent = percent if ret is not None: break time.sleep(0.35) stdout, stderr = proc.communicate() map_file.unlink(missing_ok=True) if proc.returncode != 0: raise RuntimeError(f"pdf->png batch extract failed: {stderr.strip() or stdout.strip()}") missing_png = [str(Path(str(t["png_path"])).name) for t in tasks if not Path(str(t["png_path"])).exists()] if missing_png: raise RuntimeError(f"pdf->png batch extract failed, missing outputs: {', '.join(missing_png[:5])}") def cleanup_download_cache() -> None: now = time.time() to_delete: list[str] = [] for token, meta in DOWNLOAD_CACHE.items(): created_at = float(meta.get("created_at", 0)) if now - created_at > 3600: # 1 hour to_delete.append(token) for token in to_delete: DOWNLOAD_CACHE.pop(token, None) def register_download(file_path: Path, content_type: str = "application/octet-stream") -> str: token = uuid.uuid4().hex with _DOWNLOAD_LOCK: cleanup_download_cache() DOWNLOAD_CACHE[token] = { "file_path": str(file_path), "filename": file_path.name, "content_type": content_type, "created_at": time.time(), } return token def generate_records( records: list[dict[str, Any]], config: dict[str, Any], template_path: Path, output_dir: Path, progress_cb: Callable[[int, str, str], None] | None = None, ) -> dict[str, Any]: output_cfg = config.get("output_settings", {}) auto_cleanup = bool(output_cfg.get("auto_cleanup_pptx", True)) if isinstance(output_cfg, dict) else True job_id = f"job_{now_ts()}_{uuid.uuid4().hex[:6]}" job_dir = output_dir / job_id job_dir.mkdir(parents=True, exist_ok=True) register_active_job_dir(job_dir) try: generated_items: list[dict[str, Any]] = [] download_images: list[dict[str, Any]] = [] tasks: list[dict[str, Any]] = [] page_template_cache: dict[str, bytes] | None = None used_names: set[str] = set() total_records = len(records) generation_strategy = resolve_generation_strategy(config, total_records) if generation_strategy == "page_template_cache" and total_records > 0: page_names = {str(r.get("page", "")).strip() for r in records if str(r.get("page", "")).strip()} if page_names: if progress_cb: progress_cb(14, "准备模板", "构建单页模板缓存") page_template_cache = build_page_template_cache(template_path, page_names) if progress_cb: progress_cb(18, "准备模板", f"已缓存 {len(page_template_cache)} 个页面") if progress_cb: progress_cb(20, "生成PPT", f"0/{total_records}") for i, rec in enumerate(records, start=1): base_name = safe_filename(str(rec.get("output_file") or f"喜报_{rec.get('branch', '未知网点')}_{i}.png")) if not base_name.lower().endswith(".png"): base_name += ".png" final_name = base_name seq = 1 while final_name in used_names: final_name = f"{Path(base_name).stem}_{seq}.png" seq += 1 used_names.add(final_name) ppt_name = f"{Path(final_name).stem}.pptx" ppt_path = job_dir / ppt_name pdf_path = job_dir / f"{Path(final_name).stem}.pdf" png_path = job_dir / final_name tasks.append( { "record": rec, "ppt_path": ppt_path, "pdf_path": pdf_path, "png_path": png_path, "page_number": 1, "png_name": final_name, } ) def _build_one(task: dict[str, Any]) -> None: rec = dict(task.get("record", {})) ppt_path = Path(str(task.get("ppt_path"))) try: export_page_number = build_ppt_for_record( template_path, config, rec, ppt_path, page_template_cache=page_template_cache, ) task["page_number"] = int(export_page_number) except Exception as exc: append_review_log( "generation_error", { "reason": "build_ppt_failed", "source_line": str(rec.get("source_line") or rec.get("raw_text") or ""), "record": rec, "error": str(exc), }, ) raise max_workers = resolve_build_workers(config, total_records) if total_records <= 1 or max_workers <= 1: done = 0 for task in tasks: _build_one(task) done += 1 if progress_cb: pct = 20 + int((done / max(total_records, 1)) * 45) progress_cb(pct, "生成PPT", f"{done}/{total_records}") else: done = 0 with ThreadPoolExecutor(max_workers=max_workers) as ex: futs = [ex.submit(_build_one, t) for t in tasks] for fut in as_completed(futs): fut.result() done += 1 if progress_cb: pct = 20 + int((done / max(total_records, 1)) * 45) progress_cb(pct, "生成PPT", f"{done}/{total_records}") if progress_cb: progress_cb(70, "PPT转PDF", f"已转 0/{total_records}") convert_pptx_to_pdf_batch( job_dir, [Path(str(t["ppt_path"])) for t in tasks], progress_cb=progress_cb, ) if progress_cb: progress_cb(85, "PDF转PNG", "已导出 0/{}".format(total_records)) extract_pdf_pages_to_png_batch(tasks, config=config, progress_cb=progress_cb) for i, t in enumerate(tasks, start=1): rec = dict(t["record"]) ppt_path = Path(str(t["ppt_path"])) pdf_path = Path(str(t["pdf_path"])) png_path = Path(str(t["png_path"])) if auto_cleanup: ppt_path.unlink(missing_ok=True) pdf_path.unlink(missing_ok=True) item = dict(rec) item["png_file"] = str(t["png_name"]) item["job_id"] = job_id item["image_path"] = str(png_path) token = register_download(png_path, content_type="image/png") item["download_token"] = token item["download_url"] = f"/api/download/{token}" generated_items.append(item) download_images.append( { "name": str(t["png_name"]), "download_token": token, "download_url": f"/api/download/{token}", } ) if progress_cb: pct = 95 + int((i / max(total_records, 1)) * 5) progress_cb(pct, "整理下载", f"{i}/{total_records}") return { "job_id": job_id, "job_dir": str(job_dir), "generation_strategy": generation_strategy, "generated": generated_items, "generated_count": len(generated_items), "download_images": download_images, } finally: unregister_active_job_dir(job_dir) reclaim_runtime_memory(config) class XibaoHandler(SimpleHTTPRequestHandler): def __init__(self, *args: Any, **kwargs: Any): super().__init__(*args, directory=str(STATIC_DIR), **kwargs) def _send_json(self, payload: dict[str, Any], status: HTTPStatus = HTTPStatus.OK) -> None: data = json.dumps(payload, ensure_ascii=False).encode("utf-8") self.send_response(status) self.send_header("Content-Type", "application/json; charset=utf-8") self.send_header("Content-Length", str(len(data))) self.end_headers() self.wfile.write(data) def _send_bytes( self, content: bytes, content_type: str, filename: str | None = None, status: HTTPStatus = HTTPStatus.OK, ) -> None: self.send_response(status) self.send_header("Content-Type", content_type) self.send_header("Content-Length", str(len(content))) if filename: encoded = quote(filename) self.send_header("Content-Disposition", f"attachment; filename*=UTF-8''{encoded}") self.end_headers() self.wfile.write(content) def _send_file( self, file_path: Path, content_type: str, filename: str | None = None, status: HTTPStatus = HTTPStatus.OK, max_kbps: int = 0, chunk_size: int = 16 * 1024, ) -> None: total_size = int(file_path.stat().st_size) self.send_response(status) self.send_header("Content-Type", content_type) self.send_header("Content-Length", str(total_size)) if filename: encoded = quote(filename) self.send_header("Content-Disposition", f"attachment; filename*=UTF-8''{encoded}") self.end_headers() max_bytes_per_sec = max(0, int(max_kbps)) * 1024 block_size = max(4096, int(chunk_size)) sent = 0 start_ts = time.perf_counter() with file_path.open("rb") as f: while True: chunk = f.read(block_size) if not chunk: break self.wfile.write(chunk) sent += len(chunk) if max_bytes_per_sec > 0: elapsed = time.perf_counter() - start_ts expected_elapsed = sent / max_bytes_per_sec if expected_elapsed > elapsed: time.sleep(min(expected_elapsed - elapsed, 0.2)) def _read_json_body(self) -> dict[str, Any]: try: content_len = int(self.headers.get("Content-Length", "0")) except ValueError: content_len = 0 body = self.rfile.read(content_len) if content_len > 0 else b"{}" if not body: return {} try: return json.loads(body.decode("utf-8")) except json.JSONDecodeError: raise ValueError("invalid JSON body") def do_GET(self) -> None: # noqa: N802 parsed = urlparse(self.path) if handle_get_routes(self, parsed, globals()): return if parsed.path == "/": self.path = "/index.html" super().do_GET() def do_POST(self) -> None: # noqa: N802 parsed = urlparse(self.path) if handle_post_routes(self, parsed, globals()): return self._send_json({"ok": False, "error": "not found"}, HTTPStatus.NOT_FOUND) def run_server( host: str = "0.0.0.0", port: int = 8787, prewarm: bool = True, prewarm_blocking: bool = False, ) -> None: if prewarm: prewarm_converter_image(background=not prewarm_blocking) cleanup_stop = threading.Event() cleanup_thread = threading.Thread( target=daily_cleanup_loop, args=(cleanup_stop,), name="daily-cleanup", daemon=True, ) cleanup_thread.start() server = ThreadingHTTPServer((host, port), XibaoHandler) print(f"Xibao Web running on http://{host}:{port}") try: server.serve_forever() except KeyboardInterrupt: pass finally: cleanup_stop.set() cleanup_thread.join(timeout=1.0) server.server_close() if __name__ == "__main__": parser = argparse.ArgumentParser(description="Xibao Web server") parser.add_argument("--host", default="0.0.0.0", help="bind host, e.g. 0.0.0.0 or 127.0.0.1") parser.add_argument("--port", type=int, default=8787, help="bind port") parser.add_argument( "--skip-prewarm", action="store_true", help="disable docker converter image prewarm on startup", ) parser.add_argument( "--prewarm-blocking", action="store_true", help="prewarm converter image before serving (blocks startup until ready)", ) args = parser.parse_args() run_server( host=args.host, port=args.port, prewarm=not args.skip_prewarm, prewarm_blocking=args.prewarm_blocking, )