3349 lines
108 KiB
Python
3349 lines
108 KiB
Python
#!/usr/bin/env python3
|
||
from __future__ import annotations
|
||
|
||
import argparse
|
||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||
import ctypes
|
||
import gc
|
||
import hashlib
|
||
import io
|
||
import json
|
||
import os
|
||
import re
|
||
import shutil
|
||
import subprocess
|
||
import threading
|
||
import time
|
||
import uuid
|
||
from datetime import datetime, timedelta
|
||
from http import HTTPStatus
|
||
from http.server import SimpleHTTPRequestHandler, ThreadingHTTPServer
|
||
from pathlib import Path
|
||
from typing import Any, Callable
|
||
from urllib.parse import quote, urlparse
|
||
|
||
from pptx.enum.text import PP_ALIGN
|
||
from pptx import Presentation
|
||
from api_get_routes import handle_get_routes
|
||
from api_post_routes import handle_post_routes
|
||
|
||
# ---------------------------------------------------------------------------
# Filesystem layout — everything is anchored at the directory of this script.
# ---------------------------------------------------------------------------
BASE_DIR = Path(__file__).resolve().parent
STATIC_DIR = BASE_DIR / "static"
CONFIG_PATH = BASE_DIR / "config" / "xibao_logic.json"
DATA_DIR = BASE_DIR / "data"
DEFAULT_HISTORY_PATH = DATA_DIR / "generated_history.json"
DEFAULT_OUTPUT_DIR = BASE_DIR / "output"
# Fallback PPTX template expected one level above the project directory.
DEFAULT_TEMPLATE_FALLBACK = BASE_DIR.parent / "黄金三十天喜报模版(余额、保险、理财)(1).pptx"
REVIEW_LOG_DIR = DATA_DIR / "review_logs"
MANUAL_RULES_PATH = DATA_DIR / "manual_rules.json"
ISSUE_MARKS_PATH = DATA_DIR / "marked_issues.json"
SKIPPED_SUPPRESS_PATH = DATA_DIR / "skipped_suppressions.json"

# Docker image used for document conversion, plus optional local binaries
# (None when not installed on this host).
CONVERTER_IMAGE = "minidocks/libreoffice"
LOCAL_LIBREOFFICE_BIN = shutil.which("libreoffice") or shutil.which("soffice")
LOCAL_PDFTOPPM_BIN = shutil.which("pdftoppm")

# Create writable directories eagerly so request handlers can assume they exist.
DATA_DIR.mkdir(parents=True, exist_ok=True)
DEFAULT_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
REVIEW_LOG_DIR.mkdir(parents=True, exist_ok=True)

# One lock per shared mutable structure / on-disk JSON file.
_HISTORY_LOCK = threading.Lock()
_DOWNLOAD_LOCK = threading.Lock()
_CONVERTER_LOCK = threading.Lock()
_LOG_LOCK = threading.Lock()
_PROGRESS_LOCK = threading.Lock()
_RULES_LOCK = threading.Lock()
_ISSUE_LOCK = threading.Lock()
_SKIP_SUPPRESS_LOCK = threading.Lock()
_ACTIVE_JOB_LOCK = threading.Lock()
_GENERATE_SLOT_LOCK = threading.Lock()
_GENERATE_STATE_LOCK = threading.Lock()

# Process-wide mutable state guarded by the locks above.
DOWNLOAD_CACHE: dict[str, dict[str, Any]] = {}  # download token -> file metadata
GEN_PROGRESS: dict[str, dict[str, Any]] = {}  # generation token -> progress snapshot
ACTIVE_JOB_DIRS: set[str] = set()  # resolved job dirs that cleanup must keep
ACTIVE_GENERATION: dict[str, Any] = {"token": "", "started_at": 0.0}
CONVERTER_IMAGE_READY = False
# Optional libc malloc_trim hook; presumably bound lazily via ctypes — TODO confirm.
_MALLOC_TRIM_FN: Callable[[int], int] | None = None
|
||
|
||
# Term-deposit categories: each entry maps a deposit term to its template page
# and a regex recognizing that term in free text.
TERM_DEFS = [
    {
        "type": "三个月定期",
        "page": "page_0",
        "regex": re.compile(r"(?:三|3)\s*个?月"),
    },
    {
        "type": "六个月定期",
        "page": "page_1",
        "regex": re.compile(r"(?:六|6)\s*个?月|半年"),
    },
    {
        "type": "一年期定期",
        "page": "page_2",
        "regex": re.compile(r"(?:定期)?\s*(?:存\s*)?(?:一|1)\s*年(?:期)?|定期一年|存1年"),
    },
]

# Chinese numeral characters (both everyday and formal/banker forms) -> digit.
CHINESE_DIGIT_MAP = {
    "零": 0,
    "〇": 0,
    "一": 1,
    "二": 2,
    "两": 2,
    "三": 3,
    "四": 4,
    "五": 5,
    "六": 6,
    "七": 7,
    "八": 8,
    "九": 9,
    "壹": 1,
    "贰": 2,
    "叁": 3,
    "肆": 4,
    "伍": 5,
    "陆": 6,
    "柒": 7,
    "捌": 8,
    "玖": 9,
}
# Positional multipliers for Chinese numbers (ten / hundred / thousand / 10^4 / 10^8).
CHINESE_UNIT_MAP = {"十": 10, "百": 100, "千": 1000, "万": 10000, "亿": 100000000}
# Matches a run of Chinese numeral characters, including variant forms and the
# decimal-point characters 点/點.
CHINESE_NUMBER_RE = re.compile(r"[零〇一二两三四五六七八九十百千万亿壹贰叁肆伍陆柒捌玖拾佰仟点點]+")

# Regex heuristics for business-status detection, used when the configured
# keyword rules do not match. The [^,。,;;\n]{0,N} gaps keep both keywords
# within the same clause of a sentence.
STATUS_HEURISTIC_PATTERNS: dict[str, list[re.Pattern[str]]] = {
    "揽收现金": [
        re.compile(r"揽收[^,。,;;\n]{0,24}现金"),
    ],
    "揽收彩礼": [
        re.compile(r"揽收[^,。,;;\n]{0,24}(?:彩礼|礼金)"),
        re.compile(r"(?:彩礼|礼金)[^,。,;;\n]{0,24}揽收"),
    ],
    "微信提现": [
        re.compile(r"(?:微信|支付宝)[^,。,;;\n]{0,16}提现"),
        re.compile(r"提现[^,。,;;\n]{0,16}(?:微信|支付宝)"),
    ],
    "商户提现": [
        re.compile(r"商户[^,。,;;\n]{0,16}提现"),
        re.compile(r"提现[^,。,;;\n]{0,16}商户"),
    ],
    "揽收商户": [
        re.compile(r"揽收[^,。,;;\n]{0,16}商户"),
        re.compile(r"商户[^,。,;;\n]{0,16}揽收"),
    ],
    "揽收他行": [
        re.compile(r"揽收[^,。,;;\n]{0,20}他行"),
        re.compile(r"他行[^,。,;;\n]{0,20}揽收"),
    ],
    "他行挖转": [
        re.compile(r"挖转[^,。,;;\n]{0,20}他行"),
        re.compile(r"他行[^,。,;;\n]{0,20}挖转"),
        re.compile(r"挖他行"),
    ],
}
|
||
|
||
|
||
class ConfigError(RuntimeError):
    """Raised when the application configuration is missing or malformed."""
|
||
|
||
|
||
def now_ts() -> str:
    """Return the current local time as a compact ``YYYYMMDD_HHMMSS`` stamp."""
    return f"{datetime.now():%Y%m%d_%H%M%S}"
|
||
|
||
|
||
def load_config() -> dict[str, Any]:
    """Load the JSON configuration from CONFIG_PATH.

    Raises:
        ConfigError: when the file is missing or its root is not a JSON object.
    """
    if not CONFIG_PATH.exists():
        raise ConfigError(f"config file not found: {CONFIG_PATH}")

    config = json.loads(CONFIG_PATH.read_text(encoding="utf-8"))

    if not isinstance(config, dict):
        raise ConfigError("config root must be a JSON object")
    return config
|
||
|
||
|
||
def is_windows_path(path_text: str) -> bool:
    """Return True when *path_text* starts with a Windows drive prefix like ``C:\\``."""
    return re.match(r"^[A-Za-z]:\\", path_text) is not None
|
||
|
||
|
||
def resolve_history_path(config: dict[str, Any]) -> Path:
    """Resolve the history-file location from config, defaulting when unusable.

    Windows-style paths (from another deployment) are ignored; relative paths
    are anchored at BASE_DIR. The parent directory is created in either case.
    """
    configured = str(config.get("history_file", "")).strip()
    if configured and not is_windows_path(configured):
        path = Path(configured)
        if not path.is_absolute():
            path = BASE_DIR / path
        path.parent.mkdir(parents=True, exist_ok=True)
        return path

    DEFAULT_HISTORY_PATH.parent.mkdir(parents=True, exist_ok=True)
    return DEFAULT_HISTORY_PATH
|
||
|
||
|
||
def resolve_template_path(config: dict[str, Any], override_path: str | None = None) -> Path:
    """Pick the first existing template file among the configured candidates.

    Candidate order: explicit *override_path*, ``config["template_file"]``
    (ignored when it is a Windows-style path from another deployment), the
    ``XIBAO_TEMPLATE_FILE`` environment variable, then the built-in fallback.

    Raises:
        ConfigError: when none of the candidates exists as a regular file.
    """

    def _as_candidate(text: str) -> Path:
        # Relative paths are anchored at the application directory.
        p = Path(text)
        return p if p.is_absolute() else BASE_DIR / p

    candidates: list[Path] = []

    if override_path:
        candidates.append(_as_candidate(override_path))

    cfg_template = str(config.get("template_file", "")).strip()
    if cfg_template and not is_windows_path(cfg_template):
        candidates.append(_as_candidate(cfg_template))

    env_template = os.environ.get("XIBAO_TEMPLATE_FILE", "").strip()
    if env_template:
        candidates.append(_as_candidate(env_template))

    candidates.append(DEFAULT_TEMPLATE_FALLBACK)

    for candidate in candidates:
        if candidate.exists() and candidate.is_file():
            return candidate

    raise ConfigError(
        "template file not found. Set config.template_file to a valid local path "
        "or pass template_file in request body."
    )
|
||
|
||
|
||
def resolve_output_dir(config: dict[str, Any], override_path: str | None = None) -> Path:
    """Resolve the output directory (override > config > default), creating it.

    A Windows-style configured path is ignored; relative paths are anchored
    at BASE_DIR.
    """
    if override_path:
        chosen = Path(override_path)
        if not chosen.is_absolute():
            chosen = BASE_DIR / chosen
        chosen.mkdir(parents=True, exist_ok=True)
        return chosen

    configured = str(config.get("output_dir", "")).strip()
    if configured and not is_windows_path(configured):
        chosen = Path(configured)
        if not chosen.is_absolute():
            chosen = BASE_DIR / chosen
        chosen.mkdir(parents=True, exist_ok=True)
        return chosen

    DEFAULT_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    return DEFAULT_OUTPUT_DIR
|
||
|
||
|
||
def load_history(history_path: Path) -> list[dict[str, Any]]:
    """Read history records, accepting either a bare list or {"records": [...]}.

    Non-dict entries are discarded; a missing file yields an empty list.
    """
    if not history_path.exists():
        return []

    raw = json.loads(history_path.read_text(encoding="utf-8"))

    if isinstance(raw, list):
        return [rec for rec in raw if isinstance(rec, dict)]
    if isinstance(raw, dict) and isinstance(raw.get("records"), list):
        return [rec for rec in raw["records"] if isinstance(rec, dict)]
    return []
|
||
|
||
|
||
def save_history(history_path: Path, records: list[dict[str, Any]]) -> None:
    """Overwrite the history file with *records* as pretty UTF-8 JSON."""
    payload = json.dumps(records, ensure_ascii=False, indent=2)
    history_path.write_text(payload, encoding="utf-8")
|
||
|
||
|
||
def load_manual_rules() -> list[dict[str, Any]]:
    """Load manual rules from disk; read/parse failures degrade to an empty list."""
    if not MANUAL_RULES_PATH.exists():
        return []
    with _RULES_LOCK:
        try:
            raw = json.loads(MANUAL_RULES_PATH.read_text(encoding="utf-8"))
        except Exception:
            return []
    if not isinstance(raw, list):
        return []
    return [rule for rule in raw if isinstance(rule, dict)]
|
||
|
||
|
||
def save_manual_rules(rules: list[dict[str, Any]]) -> None:
    """Overwrite the manual-rules file, serialized under the rules lock."""
    with _RULES_LOCK:
        with MANUAL_RULES_PATH.open("w", encoding="utf-8") as fh:
            json.dump(rules, fh, ensure_ascii=False, indent=2)
|
||
|
||
|
||
def _load_issue_marks_unlocked() -> list[dict[str, Any]]:
    """Read issue marks from disk without locking; caller holds _ISSUE_LOCK.

    Read or parse failures degrade to an empty list.
    """
    if not ISSUE_MARKS_PATH.exists():
        return []
    try:
        raw = json.loads(ISSUE_MARKS_PATH.read_text(encoding="utf-8"))
    except Exception:
        return []
    if not isinstance(raw, list):
        return []
    return [entry for entry in raw if isinstance(entry, dict)]
|
||
|
||
|
||
def _save_issue_marks_unlocked(items: list[dict[str, Any]]) -> None:
    """Write issue marks to disk without locking; caller holds _ISSUE_LOCK."""
    ISSUE_MARKS_PATH.parent.mkdir(parents=True, exist_ok=True)
    ISSUE_MARKS_PATH.write_text(
        json.dumps(items, ensure_ascii=False, indent=2), encoding="utf-8"
    )
|
||
|
||
|
||
def list_issue_marks(status: str = "active", limit: int = 500) -> tuple[list[dict[str, Any]], int]:
    """Return (items, total) of issue marks filtered by status, newest first.

    *status* may be "active", "resolved" or "all"; anything else means
    "active". *limit* is clamped to [1, 2000] and applied after the total
    is counted. Unknown per-item statuses are normalized to "active".
    """
    wanted = str(status or "active").strip().lower()
    if wanted not in {"active", "resolved", "all"}:
        wanted = "active"
    capped = max(1, min(2000, int(limit)))

    with _ISSUE_LOCK:
        stored = _load_issue_marks_unlocked()

    normalized: list[dict[str, Any]] = []
    for raw in stored:
        entry = dict(raw)
        state = str(entry.get("status", "active")).strip().lower()
        entry["status"] = state if state in {"active", "resolved"} else "active"
        normalized.append(entry)

    if wanted == "all":
        selected = normalized
    else:
        selected = [e for e in normalized if str(e.get("status", "active")) == wanted]

    selected.sort(
        key=lambda e: str(e.get("updated_at") or e.get("created_at") or ""),
        reverse=True,
    )
    return selected[:capped], len(selected)
|
||
|
||
|
||
def upsert_issue_mark(
    *,
    mark_type: str,
    source_line: str,
    note: str = "",
    record: dict[str, Any] | None = None,
) -> tuple[dict[str, Any], bool]:
    """Create or refresh an active issue mark keyed by (mark_type, source_line).

    Returns (item, created); *created* is False when an existing active mark
    with the same type and line was updated in place.

    Raises:
        ValueError: on an unknown mark_type or an empty source_line.
    """
    kind = str(mark_type).strip()
    if kind not in {"recognition_error", "generation_error"}:
        raise ValueError("mark_type must be recognition_error or generation_error")

    line_text = str(source_line).strip()
    if not line_text:
        raise ValueError("source_line is required")

    now = datetime.now().isoformat(timespec="seconds")
    record_obj = dict(record) if isinstance(record, dict) else {}
    note_text = str(note).strip()

    with _ISSUE_LOCK:
        items = _load_issue_marks_unlocked()
        for item in items:
            if not isinstance(item, dict):
                continue
            matches = (
                str(item.get("status", "active")).strip().lower() == "active"
                and str(item.get("mark_type", "")).strip() == kind
                and str(item.get("source_line", "")).strip() == line_text
            )
            if not matches:
                continue
            # Refresh the existing active mark instead of duplicating it.
            item["note"] = note_text
            item["record"] = record_obj
            item["updated_at"] = now
            _save_issue_marks_unlocked(items)
            return dict(item), False

        issue = {
            "id": uuid.uuid4().hex[:12],
            "mark_type": kind,
            "source_line": line_text,
            "note": note_text,
            "record": record_obj,
            "status": "active",
            "created_at": now,
            "updated_at": now,
        }
        items.append(issue)
        _save_issue_marks_unlocked(items)
        return dict(issue), True
|
||
|
||
|
||
def update_issue_mark(
    *,
    issue_id: str,
    mark_type: str | None = None,
    source_line: str | None = None,
    note: str | None = None,
    record: dict[str, Any] | None = None,
) -> dict[str, Any] | None:
    """Partially update the issue mark with id *issue_id*.

    Only fields passed as non-None are changed. Returns a copy of the updated
    item, or None when no mark has that id.

    Raises:
        ValueError: on an empty id, an unknown mark_type, or an empty
            source_line.
    """
    wanted_id = str(issue_id).strip()
    if not wanted_id:
        raise ValueError("id is required")

    now = datetime.now().isoformat(timespec="seconds")
    with _ISSUE_LOCK:
        items = _load_issue_marks_unlocked()
        target = next(
            (it for it in items if str(it.get("id", "")).strip() == wanted_id),
            None,
        )
        if target is None:
            return None

        if mark_type is not None:
            kind = str(mark_type).strip()
            if kind not in {"recognition_error", "generation_error"}:
                raise ValueError("mark_type must be recognition_error or generation_error")
            target["mark_type"] = kind

        if source_line is not None:
            line_text = str(source_line).strip()
            if not line_text:
                raise ValueError("source_line is required")
            target["source_line"] = line_text

        if note is not None:
            target["note"] = str(note).strip()

        if record is not None:
            target["record"] = dict(record) if isinstance(record, dict) else {}

        target["updated_at"] = now
        _save_issue_marks_unlocked(items)
        return dict(target)
|
||
|
||
|
||
def delete_issue_mark(issue_id: str) -> bool:
    """Delete every mark whose id equals *issue_id*; True when any was removed.

    Raises:
        ValueError: when *issue_id* is blank.
    """
    wanted = str(issue_id).strip()
    if not wanted:
        raise ValueError("id is required")

    with _ISSUE_LOCK:
        items = _load_issue_marks_unlocked()
        remaining = [it for it in items if str(it.get("id", "")).strip() != wanted]
        deleted = len(remaining) != len(items)
        if deleted:
            _save_issue_marks_unlocked(remaining)
        return deleted
|
||
|
||
|
||
def resolve_issue_marks_by_source_line(source_line: str, reason: str = "") -> dict[str, Any]:
    """Mark every active issue carrying this source line as resolved.

    Returns {"count": n, "ids": [...]} describing the resolved marks; a blank
    *source_line* resolves nothing.
    """
    line_text = str(source_line).strip()
    if not line_text:
        return {"count": 0, "ids": []}

    now = datetime.now().isoformat(timespec="seconds")
    touched: list[str] = []
    with _ISSUE_LOCK:
        items = _load_issue_marks_unlocked()
        for item in items:
            if str(item.get("status", "active")).strip().lower() != "active":
                continue
            if str(item.get("source_line", "")).strip() != line_text:
                continue
            item["status"] = "resolved"
            item["resolved_at"] = now
            item["updated_at"] = now
            if reason:
                item["resolved_reason"] = reason
            touched.append(str(item.get("id", "")).strip())
        if touched:
            _save_issue_marks_unlocked(items)
    ids = [mark_id for mark_id in touched if mark_id]
    return {"count": len(ids), "ids": ids}
|
||
|
||
|
||
def normalize_skip_line(line: str) -> str:
    """Collapse a line to its whitespace-free form for suppression matching."""
    text = str(line or "").strip()
    return re.sub(r"\s+", "", text)
|
||
|
||
|
||
def skip_suppression_key(line: str, reason: str = "") -> str:
    """Build the ``<reason>|<line>`` suppression key; "" when the line is blank."""
    line_part = normalize_skip_line(line)
    if not line_part:
        return ""
    reason_part = str(reason or "").strip()
    return f"{reason_part}|{line_part}"
|
||
|
||
|
||
def skip_suppression_keys_for_item(line: str, reason: str = "") -> list[str]:
    """Keys to probe for a skipped item: reason-specific first, then wildcard."""
    line_part = normalize_skip_line(line)
    if not line_part:
        return []
    keys: list[str] = []
    reason_part = str(reason or "").strip()
    if reason_part:
        keys.append(f"{reason_part}|{line_part}")
    keys.append(f"*|{line_part}")
    return keys
|
||
|
||
|
||
def _load_skip_suppressions_unlocked() -> list[dict[str, Any]]:
    """Read skip suppressions without locking; caller holds _SKIP_SUPPRESS_LOCK.

    Read or parse failures degrade to an empty list.
    """
    if not SKIPPED_SUPPRESS_PATH.exists():
        return []
    try:
        raw = json.loads(SKIPPED_SUPPRESS_PATH.read_text(encoding="utf-8"))
    except Exception:
        return []
    if not isinstance(raw, list):
        return []
    return [entry for entry in raw if isinstance(entry, dict)]
|
||
|
||
|
||
def _save_skip_suppressions_unlocked(items: list[dict[str, Any]]) -> None:
    """Write skip suppressions without locking; caller holds _SKIP_SUPPRESS_LOCK."""
    SKIPPED_SUPPRESS_PATH.parent.mkdir(parents=True, exist_ok=True)
    SKIPPED_SUPPRESS_PATH.write_text(
        json.dumps(items, ensure_ascii=False, indent=2), encoding="utf-8"
    )
|
||
|
||
|
||
def list_skip_suppressions() -> list[dict[str, Any]]:
    """Thread-safe snapshot of all stored skip suppressions."""
    with _SKIP_SUPPRESS_LOCK:
        return _load_skip_suppressions_unlocked()
|
||
|
||
|
||
def build_skip_suppression_lookup(items: list[dict[str, Any]]) -> set[str]:
    """Collect suppression keys, deriving one from line/reason when absent."""
    keys: set[str] = set()
    for item in items:
        if not isinstance(item, dict):
            continue
        key = str(item.get("key", "")).strip()
        if not key:
            # Legacy entries stored line/reason instead of a precomputed key.
            key = skip_suppression_key(
                str(item.get("line", "")).strip(),
                str(item.get("reason", "")).strip(),
            )
        if key:
            keys.add(key)
    return keys
|
||
|
||
|
||
def is_skip_item_suppressed(line: str, reason: str, lookup: set[str]) -> bool:
    """True when any suppression key for (line, reason) is present in *lookup*."""
    return any(key in lookup for key in skip_suppression_keys_for_item(line, reason))
|
||
|
||
|
||
def suppress_skip_item(line: str, reason: str = "") -> tuple[dict[str, Any], bool]:
    """Persist a suppression for a skipped line; returns (item, created).

    An empty reason is stored as "*", meaning the line is suppressed
    regardless of the skip reason. An existing entry with the same key is
    only touched (updated_at refreshed).

    Raises:
        ValueError: when *line* is blank.
    """
    line_text = str(line or "").strip()
    if not line_text:
        raise ValueError("line is required")
    reason_text = str(reason or "").strip()
    # "*" means suppress the same line regardless of reason.
    normalized_reason = reason_text or "*"
    key = skip_suppression_key(line_text, normalized_reason)
    if not key:
        raise ValueError("line is required")

    now = datetime.now().isoformat(timespec="seconds")
    with _SKIP_SUPPRESS_LOCK:
        items = _load_skip_suppressions_unlocked()
        for item in items:
            if str(item.get("key", "")).strip() != key:
                continue
            item["updated_at"] = now
            _save_skip_suppressions_unlocked(items)
            return dict(item), False

        created = {
            "id": uuid.uuid4().hex[:12],
            "key": key,
            "line": line_text,
            "reason": normalized_reason,
            "created_at": now,
            "updated_at": now,
        }
        items.append(created)
        _save_skip_suppressions_unlocked(items)
        return created, True
|
||
|
||
|
||
def clear_skip_suppressions() -> int:
    """Remove every stored skip suppression; return how many were removed."""
    with _SKIP_SUPPRESS_LOCK:
        count = len(_load_skip_suppressions_unlocked())
        _save_skip_suppressions_unlocked([])
    return count
|
||
|
||
|
||
def register_active_job_dir(job_dir: Path) -> None:
    """Mark *job_dir* as in use so output cleanup will not delete it."""
    resolved = str(job_dir.resolve())
    with _ACTIVE_JOB_LOCK:
        ACTIVE_JOB_DIRS.add(resolved)
|
||
|
||
|
||
def unregister_active_job_dir(job_dir: Path) -> None:
    """Release a previously registered job directory (no-op when absent)."""
    resolved = str(job_dir.resolve())
    with _ACTIVE_JOB_LOCK:
        ACTIVE_JOB_DIRS.discard(resolved)
|
||
|
||
|
||
def get_active_job_dirs() -> set[str]:
    """Return a snapshot copy of the registered active job directories."""
    with _ACTIVE_JOB_LOCK:
        return set(ACTIVE_JOB_DIRS)
|
||
|
||
|
||
def acquire_generation_slot(token: str = "") -> bool:
    """Try to take the single generation slot without blocking.

    On success the owner *token* and start time are recorded; returns False
    when another generation is already running.
    """
    if not _GENERATE_SLOT_LOCK.acquire(blocking=False):
        return False
    with _GENERATE_STATE_LOCK:
        ACTIVE_GENERATION["token"] = str(token).strip()
        ACTIVE_GENERATION["started_at"] = time.time()
    return True
|
||
|
||
|
||
def release_generation_slot(token: str = "") -> None:
    """Clear the active-generation record and free the slot lock.

    *token* is accepted for call-site symmetry with acquire but not checked.
    """
    _ = token
    with _GENERATE_STATE_LOCK:
        ACTIVE_GENERATION["token"] = ""
        ACTIVE_GENERATION["started_at"] = 0.0
    if _GENERATE_SLOT_LOCK.locked():
        try:
            _GENERATE_SLOT_LOCK.release()
        except RuntimeError:
            # Released concurrently between the check and this call; harmless.
            pass
|
||
|
||
|
||
def get_active_generation() -> dict[str, Any]:
    """Snapshot of the current generation slot owner (token, started_at)."""
    with _GENERATE_STATE_LOCK:
        return dict(ACTIVE_GENERATION)
|
||
|
||
|
||
def today_log_path() -> Path:
    """Path of today's JSONL review log inside REVIEW_LOG_DIR."""
    return REVIEW_LOG_DIR / f"review_{datetime.now():%Y-%m-%d}.jsonl"
|
||
|
||
|
||
def count_log_lines(path: Path) -> int:
    """Count the lines in *path*; a missing file counts as zero.

    Uses a generator sum so the file is streamed line-by-line rather than
    counted with a manual accumulator.
    """
    if not path.exists():
        return 0
    with path.open("r", encoding="utf-8") as fh:
        return sum(1 for _ in fh)
|
||
|
||
|
||
def append_review_log(event: str, payload: dict[str, Any] | None = None) -> Path:
    """Append one JSON line for *event* (merged with *payload*) to today's log.

    Returns the path of the log file written to. Payload keys may override
    the default "ts"/"event" fields since they are merged last.
    """
    REVIEW_LOG_DIR.mkdir(parents=True, exist_ok=True)
    log_path = today_log_path()
    entry: dict[str, Any] = {
        "ts": datetime.now().isoformat(timespec="seconds"),
        "event": str(event),
    }
    if isinstance(payload, dict):
        entry.update(payload)

    line = json.dumps(entry, ensure_ascii=False) + "\n"
    with _LOG_LOCK:
        with log_path.open("a", encoding="utf-8") as fh:
            fh.write(line)
    return log_path
|
||
|
||
|
||
def get_valid_status_list(config: dict[str, Any]) -> list[str]:
    """Read the valid-status list: replace_algorithm first, then status_extraction.

    Entries are stripped; blank ones are dropped. A malformed config yields [].
    """
    values = config.get("replace_algorithm", {}).get("status", {}).get("valid_status", [])
    if not (isinstance(values, list) and values):
        values = config.get("status_extraction", {}).get("valid_status", [])
    if not isinstance(values, list):
        return []
    return [text for text in (str(v).strip() for v in values) if text]
|
||
|
||
|
||
def normalize_status_value(status: str, config: dict[str, Any]) -> str:
    """Coerce *status* into the configured valid-status set.

    Unknown values map to the configured fallback (or the first valid status
    when the fallback itself is not valid). With no valid list configured,
    the stripped input is returned, defaulting to the fallback when empty.
    """
    fallback = str(config.get("status_extraction", {}).get("fallback", "成功营销")).strip() or "成功营销"
    valid_status = get_valid_status_list(config)
    candidate = str(status).strip()
    if not valid_status:
        return candidate or fallback
    if candidate in valid_status:
        return candidate
    return fallback if fallback in valid_status else valid_status[0]
|
||
|
||
|
||
def is_date_header_line(normalized_line: str) -> bool:
    """True for pure date headers such as "3月5日", "12月31号" or "1月2"."""
    return re.fullmatch(r"\d{1,2}月\d{1,2}[日号]?", normalized_line) is not None
|
||
|
||
|
||
def log_parse_skipped(skipped: list[dict[str, Any]], source: str) -> int:
    """Write one "parse_skip" review-log entry per skipped item; return the count.

    Items without a reason, and explanation lines dropped by the
    skip_line_rule, are not logged to keep the review log quiet.
    """
    written = 0
    for item in skipped:
        if not isinstance(item, dict):
            continue
        reason = str(item.get("reason", "")).strip()
        # 说明行不记录,避免日志噪音。
        if not reason or reason == "skip_line_rule":
            continue
        append_review_log(
            "parse_skip",
            {
                "source": source,
                "reason": reason,
                "source_line": str(item.get("line", "")).strip(),
            },
        )
        written += 1
    return written
|
||
|
||
|
||
def is_path_under(path: Path, parent: Path) -> bool:
    """True when *path* resolves to a location inside *parent* (or equals it)."""
    try:
        path.resolve().relative_to(parent.resolve())
    except Exception:
        return False
    return True
|
||
|
||
|
||
def cleanup_output_artifacts(output_dir: Path) -> dict[str, int]:
    """Delete stale job directories and generated files under *output_dir*.

    Directories named "job_*" that are not registered as active are removed
    recursively; loose .png/.pdf/.pptx/.zip files are unlinked. Download-cache
    entries whose file vanished or lived under *output_dir* are evicted.
    Returns counters: removed_dirs / removed_files / errors.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    stats = {"removed_dirs": 0, "removed_files": 0, "errors": 0}
    active_dirs = get_active_job_dirs()

    for entry in output_dir.iterdir():
        try:
            if entry.is_dir() and entry.name.startswith("job_"):
                if str(entry.resolve()) in active_dirs:
                    continue
                shutil.rmtree(entry)
                stats["removed_dirs"] += 1
            elif entry.is_file() and entry.suffix.lower() in {".png", ".pdf", ".pptx", ".zip"}:
                entry.unlink(missing_ok=True)
                stats["removed_files"] += 1
        except Exception:
            stats["errors"] += 1

    with _DOWNLOAD_LOCK:
        stale_tokens = []
        for token, meta in DOWNLOAD_CACHE.items():
            file_path = Path(str(meta.get("file_path", "")))
            if not file_path.exists() or is_path_under(file_path, output_dir):
                stale_tokens.append(token)
        for token in stale_tokens:
            DOWNLOAD_CACHE.pop(token, None)

    return {
        "removed_dirs": stats["removed_dirs"],
        "removed_files": stats["removed_files"],
        "errors": stats["errors"],
    }
|
||
|
||
|
||
def daily_cleanup_loop(stop_event: threading.Event) -> None:
    """Background loop: purge output artifacts shortly after each midnight.

    Runs until *stop_event* is set; failures are printed and logged but never
    kill the loop.
    """
    while not stop_event.is_set():
        now = datetime.now()
        next_midnight = (now + timedelta(days=1)).replace(
            hour=0, minute=0, second=0, microsecond=0
        )
        # wait() doubles as a cancellable sleep until the next midnight.
        if stop_event.wait(max((next_midnight - now).total_seconds(), 1.0)):
            break

        try:
            config = load_config()
            output_dir = resolve_output_dir(config)
            stat = cleanup_output_artifacts(output_dir)
            print(
                "[daily-cleanup] done at midnight, "
                f"removed_dirs={stat['removed_dirs']} removed_files={stat['removed_files']} errors={stat['errors']}"
            )
            append_review_log(
                "daily_cleanup",
                {
                    "output_dir": str(output_dir),
                    **stat,
                },
            )
        except Exception as exc:
            print(f"[daily-cleanup] failed: {exc}")
            append_review_log("daily_cleanup_error", {"error": str(exc)})
|
||
|
||
|
||
def build_history_lookup(history: list[dict[str, Any]], key_fields: list[str]) -> dict[str, dict[str, Any]]:
    """Index history records by their key_for_record key.

    When several records share a key, the one with the newest (string-compared)
    created_at wins; ties keep the later record in the list.
    """
    lookup: dict[str, dict[str, Any]] = {}
    for record in history:
        if not isinstance(record, dict):
            continue
        key = key_for_record(record, key_fields)
        if not key:
            continue
        current = lookup.get(key)
        if current is None:
            lookup[key] = record
            continue
        if str(record.get("created_at", "")) >= str(current.get("created_at", "")):
            lookup[key] = record
    return lookup
|
||
|
||
|
||
def build_history_signature_lookup(history: list[dict[str, Any]]) -> dict[str, dict[str, Any]]:
    """Index history records by their signature_key_for_record key.

    When several records share a key, the one with the newest (string-compared)
    created_at wins; ties keep the later record in the list.
    """
    lookup: dict[str, dict[str, Any]] = {}
    for record in history:
        if not isinstance(record, dict):
            continue
        key = signature_key_for_record(record)
        if not key:
            continue
        current = lookup.get(key)
        if current is None:
            lookup[key] = record
            continue
        if str(record.get("created_at", "")) >= str(current.get("created_at", "")):
            lookup[key] = record
    return lookup
|
||
|
||
|
||
def resolve_history_image_path(history_record: dict[str, Any], config: dict[str, Any]) -> Path | None:
    """Locate the generated image for a history record, trying several layouts.

    Candidates, in order: the stored image_path, <output>/<job_id>/<png_file>,
    <output>/<job_id>/<output_file>, and finally the first job_* directory
    containing output_file (legacy records without a job_id). Returns the
    first candidate that exists as a regular file, or None.
    """
    candidates: list[Path] = []

    image_path = str(history_record.get("image_path", "")).strip()
    if image_path:
        path = Path(image_path)
        if not path.is_absolute():
            path = BASE_DIR / path
        candidates.append(path)

    output_dir = resolve_output_dir(config)
    png_file = str(history_record.get("png_file", "")).strip()
    output_file = str(history_record.get("output_file", "")).strip()
    job_id = str(history_record.get("job_id", "")).strip()

    if job_id:
        if png_file:
            candidates.append(output_dir / job_id / png_file)
        if output_file:
            candidates.append(output_dir / job_id / output_file)

    if output_file:
        # 兼容老历史记录:仅存 output_file 时尝试在 job_* 中定位。
        for legacy in output_dir.glob(f"job_*/{output_file}"):
            candidates.append(legacy)
            break

    seen: set[str] = set()
    for candidate in candidates:
        key = str(candidate)
        if key in seen:
            continue
        seen.add(key)
        if candidate.exists() and candidate.is_file():
            return candidate
    return None
|
||
|
||
|
||
def attach_duplicate_download_info(
    dup_record: dict[str, Any],
    history_record: dict[str, Any] | None,
    config: dict[str, Any],
) -> None:
    """Enrich *dup_record* in place with a download link to the prior image.

    No-op when the history record is missing or its image cannot be located
    on disk.
    """
    if not isinstance(history_record, dict):
        return

    image_path = resolve_history_image_path(history_record, config)
    if not image_path:
        return

    token = register_download(image_path, content_type="image/png")
    dup_record["download_token"] = token
    dup_record["download_url"] = f"/api/download/{token}"
    dup_record["image_name"] = image_path.name
    if not dup_record.get("output_file"):
        dup_record["output_file"] = image_path.name
|
||
|
||
|
||
def build_history_view_items(
    history: list[dict[str, Any]],
    config: dict[str, Any],
    limit: int = 300,
) -> list[dict[str, Any]]:
    """Build display rows for history records, newest first, with download links.

    Each row copies the record and gains image_exists plus, when the image is
    found on disk, download_token / download_url / image_name. A non-positive
    *limit* disables truncation.
    """
    records = sorted(
        (x for x in history if isinstance(x, dict)),
        key=lambda x: str(x.get("created_at", "")),
        reverse=True,
    )
    if limit > 0:
        records = records[:limit]

    rows: list[dict[str, Any]] = []
    for record in records:
        row = dict(record)
        image_path = resolve_history_image_path(row, config)
        if image_path:
            token = register_download(image_path, content_type="image/png")
            row["download_token"] = token
            row["download_url"] = f"/api/download/{token}"
            row["image_exists"] = True
            row["image_name"] = image_path.name
        else:
            row["image_exists"] = False
        rows.append(row)
    return rows
|
||
|
||
|
||
def _cleanup_progress_cache_locked() -> None:
    """Drop progress entries idle for over three hours; caller holds _PROGRESS_LOCK."""
    cutoff = time.time() - 3 * 3600
    stale = [
        token
        for token, entry in GEN_PROGRESS.items()
        if float(entry.get("updated_at", 0)) < cutoff
    ]
    for token in stale:
        GEN_PROGRESS.pop(token, None)
|
||
|
||
|
||
def cleanup_progress_cache() -> None:
    """Thread-safe wrapper around _cleanup_progress_cache_locked()."""
    with _PROGRESS_LOCK:
        _cleanup_progress_cache_locked()
|
||
|
||
|
||
def set_generation_progress(
    token: str,
    *,
    status: str,
    stage: str,
    percent: int,
    detail: str = "",
    error: str = "",
) -> None:
    """Record the latest progress snapshot for a generation token.

    *percent* is clamped into [0, 100]; blank tokens are ignored. Stale cache
    entries are pruned while the lock is held.
    """
    key = str(token).strip()
    if not key:
        return
    now_text = datetime.now().isoformat(timespec="seconds")
    snapshot = {
        "token": key,
        "status": status,
        "stage": stage,
        "percent": min(100, max(0, int(percent))),
        "detail": detail,
        "error": error,
        "updated_at": time.time(),
        "updated_at_text": now_text,
    }
    with _PROGRESS_LOCK:
        _cleanup_progress_cache_locked()
        GEN_PROGRESS[key] = snapshot
|
||
|
||
|
||
def get_generation_progress(token: str) -> dict[str, Any] | None:
    """Return a copy of the progress snapshot for *token*, or None if unknown."""
    key = str(token).strip()
    if not key:
        return None
    with _PROGRESS_LOCK:
        _cleanup_progress_cache_locked()
        snapshot = GEN_PROGRESS.get(key)
        return dict(snapshot) if snapshot is not None else None
|
||
|
||
|
||
def strip_trailing_zero(num_text: str) -> str:
    """Remove a trailing fractional zero run ("1.50" -> "1.5", "2.0" -> "2")."""
    if "." not in num_text:
        return num_text
    trimmed = num_text.rstrip("0")
    return trimmed.rstrip(".")
|
||
|
||
|
||
def floor_amount_number(value: Any) -> int | None:
|
||
s = str(value).strip()
|
||
if not s:
|
||
return None
|
||
m = re.search(r"\d+(?:\.\d+)?", s)
|
||
if not m:
|
||
return None
|
||
try:
|
||
num = float(m.group(0))
|
||
except ValueError:
|
||
return None
|
||
return max(0, int(num))
|
||
|
||
|
||
def normalize_amount_text(value: Any) -> str:
    """Render *value* as "<int>万", or "" when no number can be extracted."""
    amount = floor_amount_number(value)
    return "" if amount is None else f"{amount}万"
|
||
|
||
|
||
def normalize_line(line: str, line_pattern: str) -> str:
    """Strip the configured prefix pattern and all whitespace from *line*."""
    text = line.strip()
    if not text:
        return ""

    try:
        text = re.sub(line_pattern, "", text)
    except re.error:
        # An invalid configured pattern degrades to a no-op.
        pass

    return re.sub(r"\s+", "", text).strip()
|
||
|
||
|
||
def extract_line_serial(source_line: str, line_pattern: str) -> str:
    """Extract a line's leading serial number as a canonical decimal string.

    Digits inside the configured prefix regex win; otherwise common numbering
    formats (e.g. "12、", "12.", "12)") are tried. Returns "" when no serial
    can be found.
    """
    text = str(source_line).strip()
    if not text:
        return ""

    # Prefer extracting from configured line prefix regex.
    try:
        prefix_match = re.match(line_pattern, text)
    except re.error:
        prefix_match = None
    if prefix_match:
        digits = re.search(r"\d+", prefix_match.group(0))
        if digits:
            return str(int(digits.group(0)))

    # Fallback: common numbering formats (e.g. "12、", "12.", "12)").
    fallback = re.match(r"^\s*(\d+)\s*(?:[、,,..\))]\s*)?", text)
    if fallback:
        return str(int(fallback.group(1)))
    return ""
|
||
|
||
|
||
def line_signature(normalized_line: str) -> str:
    """16-hex-char SHA-1 fingerprint of the line with all whitespace removed."""
    compact = re.sub(r"\s+", "", str(normalized_line).strip())
    if not compact:
        return ""
    return hashlib.sha1(compact.encode("utf-8")).hexdigest()[:16]
|
||
|
||
|
||
def extract_branch(line: str, config: dict[str, Any]) -> tuple[str | None, str | None]:
|
||
branches_cfg = config.get("branches", {})
|
||
allowed = branches_cfg.get("allowed", [])
|
||
alias = branches_cfg.get("alias", {})
|
||
|
||
candidates: list[str] = []
|
||
if isinstance(alias, dict):
|
||
candidates.extend(str(k) for k in alias.keys())
|
||
if isinstance(allowed, list):
|
||
candidates.extend(str(b) for b in allowed)
|
||
|
||
candidates = sorted(set(candidates), key=len, reverse=True)
|
||
|
||
raw_branch = None
|
||
for name in candidates:
|
||
if name and name in line:
|
||
raw_branch = name
|
||
break
|
||
|
||
if raw_branch is None:
|
||
return None, None
|
||
|
||
normalized = str(alias.get(raw_branch, raw_branch)) if isinstance(alias, dict) else raw_branch
|
||
return raw_branch, normalized
|
||
|
||
|
||
def match_status_heuristic(line: str, status: str) -> bool:
    """True when any heuristic regex registered for *status* matches *line*."""
    patterns = STATUS_HEURISTIC_PATTERNS.get(str(status).strip(), [])
    return any(pattern.search(line) for pattern in patterns)
|
||
|
||
|
||
def extract_status(line: str, config: dict[str, Any]) -> str:
    """Derive the marketing-status label for a relay line.

    Matching order:
      1. literal keyword rules from ``status_extraction.extraction_rules``,
         walked in ``priority_order``;
      2. regex heuristics (``match_status_heuristic``) for statuses that are
         valid per config;
      3. hard-coded fallbacks for "提现" and "他行" wording;
      4. the configured fallback status (default "成功营销").

    The result is passed through ``normalize_status_value`` except on the
    malformed-config early return.
    """
    # Normalize common wording variants before status matching.
    status_line = str(line or "").replace("外行", "他行")
    cfg = config.get("status_extraction", {})
    rules = cfg.get("extraction_rules", {})
    order = cfg.get("priority_order", [])
    fallback = str(cfg.get("fallback", "成功营销"))

    if not isinstance(rules, dict):
        # Malformed config: return the raw fallback without normalization.
        return fallback

    if not isinstance(order, list) or not order:
        # No explicit priority configured: use rule insertion order.
        order = [str(k) for k in rules.keys()]

    # Pass 1: literal keyword containment; first hit in priority order wins.
    for status in order:
        keywords = rules.get(status, [status])
        if not isinstance(keywords, list):
            continue
        for kw in keywords:
            kw_text = str(kw)
            if kw_text and kw_text in status_line:
                return normalize_status_value(str(status), config)

    # Pass 2: regex heuristics, restricted to statuses valid per config.
    valid_status = set(get_valid_status_list(config))
    for status in order:
        s = str(status).strip()
        if not s:
            continue
        if valid_status and s not in valid_status:
            continue
        if match_status_heuristic(status_line, s):
            return normalize_status_value(s, config)

    # Heuristic: plain "提现" defaults to 微信提现 unless it clearly indicates 商户.
    if "提现" in status_line:
        if "商户" in status_line and "商户提现" in valid_status:
            return "商户提现"
        if "微信提现" in valid_status:
            return "微信提现"

    # Heuristic: phrases like "揽收他行40万存一年" must not fall back to "成功营销".
    if "他行" in status_line:
        if "揽收他行" in valid_status:
            return "揽收他行"
        if "他行挖转" in valid_status:
            return "他行挖转"

    return normalize_status_value(fallback, config)
|
||
|
||
|
||
def normalize_chinese_number_text(text: str) -> str:
    """Canonicalize traditional/financial Chinese numeral variants.

    Maps 點→点, 拾→十, 佰→百, 仟→千 and strips surrounding whitespace so the
    downstream numeral parsers only have to handle the simplified forms.
    """
    result = str(text).strip()
    for old, new in (("點", "点"), ("拾", "十"), ("佰", "百"), ("仟", "千")):
        result = result.replace(old, new)
    return result
|
||
|
||
|
||
def chinese_integer_to_float(text: str) -> float | None:
    """Convert a Chinese integer numeral (e.g. "两万五千") to a float.

    Uses the standard section-based algorithm: small units (十/百/千) scale
    the pending digit inside the current section, while larger units from
    CHINESE_UNIT_MAP close the section and scale it into the running total.
    Returns None for empty input or any unrecognized character.
    """
    s = normalize_chinese_number_text(text)
    if not s:
        return None

    total = 0    # value accumulated across closed large-unit sections
    section = 0  # value of the current, not-yet-closed section
    number = 0   # most recent bare digit awaiting a unit
    seen = False # whether any digit/unit character was consumed

    for ch in s:
        if ch in CHINESE_DIGIT_MAP:
            number = CHINESE_DIGIT_MAP[ch]
            seen = True
            continue

        unit = CHINESE_UNIT_MAP.get(ch)
        if unit is None:
            # Unknown character: not a pure numeral.
            return None

        seen = True
        if unit in {10, 100, 1000}:
            # Small unit; a bare "十" (no leading digit) means 1, e.g. "十万".
            if number == 0:
                number = 1
            section += number * unit
            number = 0
            continue

        # Large unit closes the current section; a bare large unit counts as 1.
        section += number
        if section == 0:
            section = 1
        total += section * unit
        section = 0
        number = 0

    if not seen:
        return None
    return float(total + section + number)
|
||
|
||
|
||
def chinese_text_to_float(text: str) -> float | None:
    """Parse a Chinese numeral, optionally with a "点" decimal part.

    The integer part is delegated to ``chinese_integer_to_float``; the part
    after "点" must consist purely of digit characters.  Returns None for
    empty or unparseable input.
    """
    s = normalize_chinese_number_text(text)
    if not s:
        return None

    if "点" not in s:
        return chinese_integer_to_float(s)

    int_part, frac_part = s.split("点", 1)
    if not frac_part:
        return None
    int_value = chinese_integer_to_float(int_part) if int_part else 0.0
    if int_value is None:
        return None

    frac_digits: list[str] = []
    for ch in frac_part:
        digit = CHINESE_DIGIT_MAP.get(ch)
        if digit is None:
            return None
        frac_digits.append(str(digit))
    return float(f"{int(int_value)}.{''.join(frac_digits)}")
|
||
|
||
|
||
def chinese_digit_fraction(text: str) -> float | None:
    """Interpret bare digit characters as a decimal fraction (e.g. "五" -> 0.5).

    Every character must be in CHINESE_DIGIT_MAP; otherwise None.
    """
    s = normalize_chinese_number_text(text)
    if not s:
        return None
    digit_chars: list[str] = []
    for ch in s:
        if ch not in CHINESE_DIGIT_MAP:
            return None
        digit_chars.append(str(CHINESE_DIGIT_MAP[ch]))
    return int("".join(digit_chars)) / (10 ** len(digit_chars))
|
||
|
||
|
||
def parse_wan_suffix_shorthand(text: str) -> float | None:
    """Parse the colloquial tail after "万" (e.g. the "五" in "两万五" -> 0.5万).

    Empty input means "no tail" and yields 0.0; a tail that carries its own
    unit characters is not shorthand and yields None.
    """
    s = normalize_chinese_number_text(text)
    if not s:
        return 0.0

    if any(unit in s for unit in ("十", "百", "千", "万", "亿")):
        return None

    if "点" in s:
        value = chinese_text_to_float(s)
        # e.g. "五点五" after "万" => 0.55 万
        return None if value is None else value / 10.0

    return chinese_digit_fraction(s)
|
||
|
||
|
||
def parse_chinese_amount_value(raw_text: str, right_1: str, right_2: str) -> float | None:
    """Parse a Chinese amount expression into a value denominated in 万 (10k).

    Args:
        raw_text: the matched Chinese numeral text.
        right_1: 1-character lookahead following the match in the source line.
        right_2: 2-character lookahead, used to detect a trailing "万元" unit.

    Handles full unit forms ("两万五千" -> 2.5), colloquial shorthand
    ("两万五" -> 2.5) and plain numerals.  Returns None when unparseable.
    """
    token = normalize_chinese_number_text(raw_text)
    if not token:
        return None

    # Reject tokens with neither digit characters nor small units (十/百/千).
    has_digit = any(ch in CHINESE_DIGIT_MAP for ch in token)
    has_small_unit = any(ch in token for ch in ("十", "百", "千"))
    if not has_digit and not has_small_unit:
        return None

    # "万" may appear inside the token or immediately after it in the line.
    has_wan = ("万" in token) or (right_1 == "万") or right_2.startswith("万元")
    if has_wan:
        if "万" in token:
            left, right = token.split("万", 1)
            left_text = left.strip()
            right_text = right.strip()

            left_value = chinese_text_to_float(left_text) if left_text else 0.0
            if left_value is None:
                return None

            if not right_text:
                return left_value

            right_norm = normalize_chinese_number_text(right_text)

            # Full unit expression (e.g. 两万五千、两万零五百) -> convert absolute value to "万".
            if any(ch in right_norm for ch in ("十", "百", "千", "万", "亿")):
                absolute = chinese_text_to_float(token)
                if absolute is None:
                    return None
                return absolute / 10000.0

            # Colloquial shorthand (e.g. 两万五 -> 2.5万, 一万二 -> 1.2万).
            shorthand = parse_wan_suffix_shorthand(right_norm)
            if shorthand is not None:
                return left_value + shorthand

            # Last resort: treat the tail as a single tenth-scale value.
            right_value = chinese_text_to_float(right_norm)
            if right_value is None:
                return left_value
            return left_value + (right_value / 10.0)

        # e.g. "五 万" where token may be "五"
        value = chinese_text_to_float(token)
        return value

    # No "万" context: return the plain numeral value.
    return chinese_text_to_float(token)
|
||
|
||
|
||
def extract_amount_candidates(text: str) -> list[dict[str, Any]]:
    """Collect amount candidates (Arabic and Chinese numerals) from *text*.

    Each candidate dict carries: ``value`` (float), ``num`` (display string),
    ``start``/``end`` (character span in *text*) and ``has_wan`` (True when an
    explicit 万/W/万元 unit follows or is embedded).  Numbers followed by
    time/date/tenor markers (年/月/期/号/个月) are excluded.
    """
    candidates: list[dict[str, Any]] = []

    # Pass 1: Arabic numerals, with an optional decimal part.
    for m in re.finditer(r"\d+(?:\.\d+)?", text):
        num_text = m.group(0)
        start, end = m.span()

        right_2 = text[end : end + 2]
        right_1 = text[end : end + 1]

        # Exclude time/date/tenor hints: 2月23日, 5年交, 6个月 ...
        if right_1 in {"年", "月", "期", "号"}:
            continue
        if right_2.startswith("个月"):
            continue

        has_wan = False
        if right_1 in {"万", "W", "w"} or right_2.startswith("万元"):
            has_wan = True

        try:
            value = float(num_text)
        except ValueError:
            continue

        candidates.append(
            {
                "value": value,
                "num": strip_trailing_zero(num_text),
                "start": start,
                "end": end,
                "has_wan": has_wan,
            }
        )

    # Pass 2: Chinese numerals matched by the module-level CHINESE_NUMBER_RE.
    for m in CHINESE_NUMBER_RE.finditer(text):
        raw_text = m.group(0)
        start, end = m.span()

        right_2 = text[end : end + 2]
        right_1 = text[end : end + 1]

        if right_1 in {"年", "月", "期", "号"}:
            continue
        if right_2.startswith("个月"):
            continue

        # Avoid treating single Chinese digits in names as amount, e.g. "钱七/周八/吴九".
        # Keep unit-bearing forms (e.g. "十万", "五万元") untouched.
        if (
            len(raw_text) == 1
            and raw_text in CHINESE_DIGIT_MAP
            and right_1 not in {"万", "元"}
            and not right_2.startswith("万元")
        ):
            continue

        value = parse_chinese_amount_value(raw_text, right_1, right_2)
        if value is None:
            continue

        has_wan = ("万" in raw_text) or (right_1 == "万") or right_2.startswith("万元")
        candidates.append(
            {
                "value": value,
                "num": strip_trailing_zero(str(value)),
                "start": start,
                "end": end,
                "has_wan": has_wan,
            }
        )

    return candidates
|
||
|
||
|
||
def amount_num_to_text(num_text: str) -> str:
    """Render a numeric amount string as "<n>万"; "" when unparseable."""
    floored = floor_amount_number(num_text)
    return "" if floored is None else f"{floored}万"
|
||
|
||
|
||
def pick_amount_from_candidates(
    text: str,
    candidates: list[dict[str, Any]],
    anchor_pos: int | None = None,
) -> str | None:
    """Choose the most plausible amount among *candidates* found in *text*.

    Candidates carrying an explicit "万" marker are preferred.  Without an
    anchor a "合计/共计" total wins, otherwise the largest value; with an
    anchor the closest candidate wins, with a distance bonus for totals and
    a tie break favouring values left of the anchor.
    """
    if not candidates:
        return None

    wan_marked = [cand for cand in candidates if bool(cand.get("has_wan"))]
    pool = wan_marked or candidates

    if anchor_pos is None:
        # Prefer "合计/共计" amount, otherwise largest amount.
        for cand in pool:
            context = text[max(0, cand["start"] - 2) : cand["start"] + 2]
            if "合计" in context or "共计" in context:
                return amount_num_to_text(cand["num"])
        largest = max(pool, key=lambda cand: cand["value"])
        return amount_num_to_text(largest["num"])

    def rank(cand: dict[str, Any]) -> tuple[float, int]:
        distance = abs(cand["start"] - anchor_pos)
        lead_in = text[max(0, cand["start"] - 3) : cand["start"]]
        if "合计" in lead_in or "共计" in lead_in:
            distance -= 3
        # Prefer the value on the left side if equally close.
        return (distance, 0 if cand["start"] <= anchor_pos else 1)

    return amount_num_to_text(min(pool, key=rank)["num"])
|
||
|
||
|
||
def split_segments(text: str) -> list[str]:
    """Split *text* on Chinese/ASCII comma, semicolon and full-stop separators,
    dropping empty pieces."""
    pieces = re.split(r"[,,;;。]", text)
    return [piece.strip() for piece in pieces if piece.strip()]
|
||
|
||
|
||
def normalize_insurance_year(value: Any) -> str | None:
|
||
if value is None:
|
||
return None
|
||
s = str(value).strip()
|
||
if s in {"3", "3年", "3年交", "三年", "三年交"}:
|
||
return "3"
|
||
if s in {"5", "5年", "5年交", "五年", "五年交"}:
|
||
return "5"
|
||
return None
|
||
|
||
|
||
def normalize_insurance_year_choices(value: Any) -> dict[str, str]:
    """Sanitize a choice-key -> year mapping, keeping only valid entries."""
    if not isinstance(value, dict):
        return {}
    cleaned = ((str(k).strip(), normalize_insurance_year(v)) for k, v in value.items())
    return {key: year for key, year in cleaned if key and year}
|
||
|
||
|
||
def build_insurance_choice_key(
    *,
    source_line: str,
    raw_text: str,
    branch: str,
    amount: str,
    page: str,
    item_index: int,
) -> str:
    """Build a stable 16-hex key identifying one insurance item within a line.

    SHA-1 is used only as a deterministic fingerprint of the joined fields.
    """
    seed = "|".join((source_line, raw_text, branch, amount, page, str(item_index)))
    return hashlib.sha1(seed.encode("utf-8")).hexdigest()[:16]
|
||
|
||
|
||
def pick_insurance_year_for_record(
    *,
    choice_key: str,
    source_line: str,
    raw_text: str,
    branch: str,
    amount: str,
    year_map: dict[str, str],
    default_year: str | None,
) -> str | None:
    """Resolve the 3/5-year choice for a record from user-supplied mappings.

    Several lookup keys are tried from most to least specific; the first map
    hit that normalizes to "3"/"5" wins, otherwise *default_year* is
    normalized and returned.
    """
    lookup_keys = (
        choice_key,
        source_line,
        raw_text,
        f"{source_line}|{amount}",
        f"{raw_text}|{amount}",
        f"{branch}|{amount}",
    )
    for key in lookup_keys:
        if not key or key not in year_map:
            continue
        year = normalize_insurance_year(year_map.get(key))
        if year in {"3", "5"}:
            return year
    return normalize_insurance_year(default_year)
|
||
|
||
|
||
def make_product(type_keyword: str, page: str, amount: str | None, **kwargs: Any) -> dict[str, Any]:
|
||
item = {
|
||
"type": type_keyword,
|
||
"page": page,
|
||
"amount": amount,
|
||
}
|
||
item.update(kwargs)
|
||
return item
|
||
|
||
|
||
def detect_products(
    line: str,
    config: dict[str, Any],
    insurance_year_choice: str | None,
) -> list[dict[str, Any]]:
    """Detect every product mentioned in a relay line.

    The line is split into punctuation-delimited segments; each segment is
    scanned for insurance (with 3/5-year resolution), wealth/fund/asset
    keywords and the fixed-term definitions in TERM_DEFS.  Amounts are picked
    per segment and fall back to the whole-line candidates.  Duplicate
    ``type|page|amount`` combinations are emitted only once.
    """
    insurance_cfg = config.get("insurance_handling", {})
    insurance_page = str(insurance_cfg.get("page", "page_3")) if isinstance(insurance_cfg, dict) else "page_3"

    products: list[dict[str, Any]] = []
    seen: set[str] = set()  # de-dup on "type|page|amount"

    segments = split_segments(line)
    if not segments:
        segments = [line]
    line_candidates = extract_amount_candidates(line)
    seg_search_pos = 0  # moving cursor so repeated segment text maps left-to-right

    def locate_segment_start(seg_text: str) -> int:
        # Find the segment's offset in the original line, scanning forward
        # from the previous segment so identical segments resolve in order.
        nonlocal seg_search_pos
        if not seg_text:
            return 0
        pos = line.find(seg_text, seg_search_pos)
        if pos < 0:
            pos = line.find(seg_text)
        if pos >= 0:
            seg_search_pos = pos + len(seg_text)
            return pos
        return 0

    def pick_segment_amount(
        seg_text: str,
        seg_candidates: list[dict[str, Any]],
        seg_anchor: int | None,
        seg_start: int,
    ) -> str | None:
        # Prefer an amount inside the segment; otherwise retry against the
        # whole line with the anchor translated to line coordinates.
        amount = pick_amount_from_candidates(seg_text, seg_candidates, seg_anchor)
        if amount:
            return amount
        if not line_candidates:
            return None
        line_anchor = None
        if isinstance(seg_anchor, int) and seg_anchor >= 0:
            line_anchor = seg_start + seg_anchor
        return pick_amount_from_candidates(line, line_candidates, line_anchor)

    for seg in segments:
        seg_candidates = extract_amount_candidates(seg)
        seg_start = locate_segment_start(seg)

        # 1) Insurance (always mapped to 期交 page)
        if "保险" in seg or "期交" in seg or re.search(r"(?:3|5|三|五)\s*年交", seg):
            # Explicit 3/5-year wording in the segment beats the global choice.
            year = None
            if re.search(r"(?:3|三)\s*年交", seg):
                year = "3"
            elif re.search(r"(?:5|五)\s*年交", seg):
                year = "5"
            elif insurance_year_choice in {"3", "5"}:
                year = insurance_year_choice

            insurance_anchor = seg.find("保险")
            if insurance_anchor < 0:
                insurance_anchor = 0
            amount = pick_segment_amount(seg, seg_candidates, insurance_anchor, seg_start)

            if year == "3":
                type_keyword = "3年交"
            elif year == "5":
                type_keyword = "5年交"
            else:
                type_keyword = "保险"

            key = f"{type_keyword}|{insurance_page}|{amount}"
            if key not in seen:
                # needs_insurance_year flags the record for a later user prompt.
                products.append(
                    make_product(
                        type_keyword,
                        insurance_page,
                        amount,
                        is_insurance=True,
                        needs_insurance_year=(year is None),
                    )
                )
                seen.add(key)

        # 2) Wealth/Fund/Asset
        if "理财" in seg:
            amount = pick_segment_amount(seg, seg_candidates, seg.find("理财"), seg_start)
            key = f"理财|page_5|{amount}"
            if key not in seen:
                products.append(make_product("理财", "page_5", amount))
                seen.add(key)

        if "基金" in seg:
            amount = pick_segment_amount(seg, seg_candidates, seg.find("基金"), seg_start)
            key = f"基金|page_7|{amount}"
            if key not in seen:
                products.append(make_product("基金", "page_7", amount))
                seen.add(key)

        if "资管" in seg:
            amount = pick_segment_amount(seg, seg_candidates, seg.find("资管"), seg_start)
            key = f"资管|page_6|{amount}"
            if key not in seen:
                products.append(make_product("资管", "page_6", amount))
                seen.add(key)

        # 3) Fixed-term deposit variants
        for term in TERM_DEFS:
            rgx = term["regex"]
            for m in rgx.finditer(seg):
                # Avoid interpreting "2年理财" as one-year term.
                if term["page"] == "page_2" and "理财" in seg and "定期" not in seg and "存" not in seg:
                    continue

                amount = pick_segment_amount(seg, seg_candidates, m.start(), seg_start)
                key = f"{term['type']}|{term['page']}|{amount}"
                if key not in seen:
                    products.append(make_product(str(term["type"]), str(term["page"]), amount))
                    seen.add(key)

    return products
|
||
|
||
|
||
def is_demand_deposit_only_line(line: str) -> bool:
    """Return True for lines that only describe demand-deposit activity.

    A line qualifies when it contains an amount and demand-style wording
    (提现/挖转/他行/...) but no explicit product or tenor marker — such lines
    should not generate a poster by default.
    """
    demand_markers = (
        "微信提现",
        "微信支付宝提现",
        "支付宝提现",
        "挖转",
        "他行",
        "揽收现金",
        "存量提升",
        "揽收商户",
        "商户提现",
        "揽收彩礼",
    )
    explicit_product_markers = (
        "保险",
        "年交",
        "理财",
        "基金",
        "资管",
        "趸交",
        "三个月",
        "3个月",
        "六个月",
        "6个月",
        "半年",
        "一年",
        "1年",
    )

    if not extract_amount_candidates(line):
        return False
    has_demand = any(marker in line for marker in demand_markers)
    has_product = any(marker in line for marker in explicit_product_markers)
    return has_demand and not has_product
|
||
|
||
|
||
def key_for_record(record: dict[str, Any], key_fields: list[str]) -> str:
    """Build the dedup key for a record.

    Records carrying a line serial get a positional key
    ``line:<serial>|item:<index>[|sig:<signature>]``; all others fall back
    to joining the configured *key_fields* values with "|".
    """
    serial = str(record.get("line_serial", "")).strip()
    if not serial:
        return "|".join(str(record.get(field, "")).strip() for field in key_fields)

    item_index = str(record.get("line_product_index", "")).strip() or "1"
    signature = str(record.get("line_signature", "")).strip()
    if not signature:
        signature = line_signature(str(record.get("raw_text", "")))
    parts = [f"line:{serial}", f"item:{item_index}"]
    if signature:
        parts.append(f"sig:{signature}")
    return "|".join(parts)
|
||
|
||
|
||
def signature_key_for_record(record: dict[str, Any]) -> str:
    """Build the signature-based dedup key ``sig:<hash>|item:<index>``.

    Falls back from the stored signature to hashing ``raw_text``, then to
    hashing the normalized ``source_line``; returns "" when no signature is
    derivable.
    """
    item_index = str(record.get("line_product_index", "")).strip() or "1"
    signature = str(record.get("line_signature", "")).strip()
    if not signature:
        raw_text = str(record.get("raw_text", "")).strip()
        if raw_text:
            signature = line_signature(raw_text)
        else:
            source_line = str(record.get("source_line", "")).strip()
            if source_line:
                # Strip a generic numeric prefix before hashing.
                normalized_source = normalize_line(source_line, r"^\s*\d+\s*(?:[、,,..\))]\s*)?")
                signature = line_signature(normalized_source)
    return f"sig:{signature}|item:{item_index}" if signature else ""
|
||
|
||
|
||
def render_output_filename(config: dict[str, Any], record: dict[str, Any], index: int) -> str:
    """Render the output PNG filename from the configured pattern.

    Falls back to the default pattern on any formatting error and
    guarantees a ``.png`` suffix.
    """
    default_pattern = "喜报_{branch}_{index}.png"
    settings = config.get("output_settings", {})
    pattern = str(settings.get("output_pattern", default_pattern)) if isinstance(settings, dict) else default_pattern

    try:
        name = pattern.format(branch=record.get("branch", "未知网点"), index=index)
    except Exception:
        # Broken user pattern: fall back to the default naming scheme.
        name = f"喜报_{record.get('branch', '未知网点')}_{index}.png"

    return name if name.lower().endswith(".png") else f"{name}.png"
|
||
|
||
|
||
def normalize_branch_value(branch: Any, config: dict[str, Any]) -> str:
    """Map a raw branch name through the configured alias table."""
    name = str(branch).strip()
    alias = config.get("branches", {}).get("alias", {})
    if not isinstance(alias, dict) or name not in alias:
        return name
    return str(alias.get(name, name)).strip()
|
||
|
||
|
||
def infer_page_from_type(type_keyword: str, config: dict[str, Any]) -> str:
    """Resolve the template page for a product type keyword.

    Tries the ``type_matching`` table (exact key, then longest configured
    key contained in the keyword), then the fixed-term definitions in
    TERM_DEFS; returns "" when unknown.
    """
    keyword = str(type_keyword).strip()
    if not keyword:
        return ""

    type_map = config.get("type_matching", {})
    if isinstance(type_map, dict):
        exact = str(type_map.get(keyword, "")).strip()
        if exact:
            return exact

        # Longest configured key contained in the keyword wins.
        pairs = sorted(
            ((str(k), str(v)) for k, v in type_map.items()),
            key=lambda pair: len(pair[0]),
            reverse=True,
        )
        for key, page in pairs:
            if key and key in keyword and page:
                return page.strip()

    for term in TERM_DEFS:
        if keyword == str(term.get("type", "")):
            return str(term.get("page", "")).strip()
    return ""
|
||
|
||
|
||
def apply_record_overrides(
    record: dict[str, Any],
    updates: dict[str, Any],
    config: dict[str, Any],
) -> dict[str, Any]:
    """Return a copy of *record* with sanitized override fields applied.

    Empty or invalid override values are ignored.  Overriding ``type``
    without an explicit ``page`` re-infers the page from the new type.
    """
    result = dict(record)
    if not isinstance(updates, dict):
        return result

    if "branch" in updates:
        branch = normalize_branch_value(updates.get("branch", ""), config)
        if branch:
            result["branch"] = branch

    if "amount" in updates:
        amount = normalize_amount_text(updates.get("amount", ""))
        if amount:
            result["amount"] = amount

    if "type" in updates:
        new_type = str(updates.get("type", "")).strip()
        if new_type:
            result["type"] = new_type
            if "page" not in updates:
                # Keep page consistent with the newly assigned type.
                inferred = infer_page_from_type(new_type, config)
                if inferred:
                    result["page"] = inferred

    if "page" in updates:
        new_page = str(updates.get("page", "")).strip()
        if new_page:
            result["page"] = new_page

    if "status" in updates:
        new_status = str(updates.get("status", "")).strip()
        if new_status:
            result["status"] = normalize_status_value(new_status, config)

    return result
|
||
|
||
|
||
def validate_record_for_generation(record: dict[str, Any], config: dict[str, Any]) -> None:
    """Validate (and normalize in place) a record before poster generation.

    Raises:
        ValueError: on any missing or invalid field (branch, amount, type,
            page, status).

    On success the record's ``amount``, ``page`` and ``status`` fields may be
    rewritten to their canonical forms.
    """
    branch = str(record.get("branch", "")).strip()
    amount = normalize_amount_text(record.get("amount", ""))
    type_keyword = str(record.get("type", "")).strip()
    page = str(record.get("page", "")).strip()
    status = str(record.get("status", "")).strip()

    if not branch:
        raise ValueError("branch is required")
    allowed = config.get("branches", {}).get("allowed", [])
    if isinstance(allowed, list) and allowed:
        allow_set = {str(x).strip() for x in allowed}
        if branch not in allow_set:
            raise ValueError(f"branch not allowed: {branch}")

    if not amount:
        raise ValueError("amount is required")
    # Persist the canonical amount back onto the record.
    record["amount"] = amount

    if not type_keyword:
        raise ValueError("type is required")
    if not page:
        # Try to recover the page from the type keyword before failing.
        page = infer_page_from_type(type_keyword, config)
        if page:
            record["page"] = page
    page = str(record.get("page", "")).strip()
    if not page:
        raise ValueError("page is required")

    pages = config.get("pages", {})
    if isinstance(pages, dict) and page not in pages:
        raise ValueError(f"invalid page: {page}")

    if not status:
        # Missing status falls back to the configured default.
        status = str(config.get("status_extraction", {}).get("fallback", "成功营销")).strip() or "成功营销"
    valid_status = get_valid_status_list(config)
    if valid_status and status not in valid_status:
        raise ValueError(f"invalid status: {status}")
    record["status"] = normalize_status_value(status, config)
|
||
|
||
|
||
def match_manual_rule(rule: dict[str, Any], source_line: str, normalized_line: str) -> bool:
    """Check whether an enabled manual rule's keyword matches the line.

    ``match_mode`` selects the haystack: "source", "both", or (default)
    the normalized line.  Disabled, malformed or keyword-less rules never
    match.
    """
    if not isinstance(rule, dict) or not bool(rule.get("enabled", True)):
        return False
    keyword = str(rule.get("keyword", "")).strip()
    if not keyword:
        return False
    mode = str(rule.get("match_mode", "normalized")).strip().lower()
    haystacks = {
        "source": (source_line,),
        "both": (source_line, normalized_line),
    }.get(mode, (normalized_line,))
    return any(keyword in haystack for haystack in haystacks)
|
||
|
||
|
||
def apply_manual_rules_to_record(
    record: dict[str, Any],
    *,
    source_line: str,
    normalized_line: str,
    rules: list[dict[str, Any]],
    config: dict[str, Any],
) -> tuple[dict[str, Any], list[str]]:
    """Apply every matching manual rule to *record*.

    Returns the (possibly updated) record plus the non-empty ids of the
    rules that applied.  Rules that fail to apply are skipped silently so a
    broken rule cannot abort parsing.
    """
    if not rules:
        return record, []

    result = dict(record)
    applied: list[str] = []
    for rule in rules:
        if not match_manual_rule(rule, source_line, normalized_line):
            continue
        updates = rule.get("updates", {})
        if not isinstance(updates, dict):
            continue
        try:
            result = apply_record_overrides(result, updates, config)
        except Exception:
            # Best-effort: ignore rules whose updates cannot be applied.
            continue
        rule_id = str(rule.get("id", ""))
        if rule_id:
            applied.append(rule_id)
    return result, applied
|
||
|
||
|
||
def save_or_update_manual_rule(
    *,
    keyword: str,
    updates: dict[str, Any],
    note: str = "",
    match_mode: str = "normalized",
) -> dict[str, Any]:
    """Persist a manual correction rule, de-duplicating identical rules.

    If a rule with the same keyword, match mode and updates already exists,
    its ``updated_at`` timestamp (and optionally ``note``) is refreshed;
    otherwise a new rule is appended.  Returns a copy of the stored rule.

    Raises:
        ValueError: when the keyword is empty or the updates dict is
            empty/contains no usable fields.
    """
    # Keyword is matched via substring containment, so internal whitespace
    # is stripped to align with normalized-line matching.
    kw = re.sub(r"\s+", "", str(keyword).strip())
    if not kw:
        raise ValueError("rule keyword is required")
    if not isinstance(updates, dict) or not updates:
        raise ValueError("rule updates is required")

    # Keep only the supported fields with non-empty values.
    normalized_updates: dict[str, Any] = {}
    for k in ("branch", "type", "page", "status", "amount"):
        if k in updates:
            v = str(updates.get(k, "")).strip()
            if v:
                normalized_updates[k] = v
    if not normalized_updates:
        raise ValueError("rule updates is empty")

    mode = str(match_mode or "normalized").strip().lower()
    if mode not in {"source", "normalized", "both"}:
        mode = "normalized"

    rules = load_manual_rules()
    now = datetime.now().isoformat(timespec="seconds")
    # Refresh an existing identical rule instead of appending a duplicate.
    for rule in rules:
        if not isinstance(rule, dict):
            continue
        if str(rule.get("keyword", "")).strip() != kw:
            continue
        if str(rule.get("match_mode", "normalized")).strip().lower() != mode:
            continue
        if rule.get("updates") != normalized_updates:
            continue
        rule["updated_at"] = now
        if note:
            rule["note"] = note
        save_manual_rules(rules)
        return dict(rule)

    item = {
        "id": uuid.uuid4().hex[:12],
        "keyword": kw,
        "match_mode": mode,
        "updates": normalized_updates,
        "enabled": True,
        "created_at": now,
        "updated_at": now,
    }
    if note:
        item["note"] = note
    rules.append(item)
    save_manual_rules(rules)
    return dict(item)
|
||
|
||
|
||
def infer_correction_rule_keyword(
    *,
    source_line: str,
    normalized_line: str,
    corrected_record: dict[str, Any],
) -> str:
    """Pick a keyword for an auto-generated correction rule.

    Prefers the corrected status, then a short (<=8 char) corrected type,
    when either literally occurs in the line; otherwise a 24-character
    prefix of the (whitespace-stripped) line.
    """
    status = str(corrected_record.get("status", "")).strip()
    if status and (status in normalized_line or status in source_line):
        return status

    type_kw = str(corrected_record.get("type", "")).strip()
    if type_kw and len(type_kw) <= 8 and (type_kw in normalized_line or type_kw in source_line):
        return type_kw

    return normalized_line[:24] if normalized_line else re.sub(r"\s+", "", source_line)[:24]
|
||
|
||
|
||
def parse_records(
    raw_text: str,
    config: dict[str, Any],
    history: list[dict[str, Any]],
    insurance_year_choice: str | None = None,
    insurance_year_choices: dict[str, str] | None = None,
) -> dict[str, Any]:
    """Parse relay text into generation-ready records.

    Pipeline: split and normalize lines -> drop trigger/date/skip lines ->
    per line: resolve branch, status and products -> per product: build a
    record, resolve the 3/5-year insurance choice, apply manual rules,
    re-validate, then classify as new / duplicate (history or in-batch) /
    pending-insurance.  Returns a dict with the classified record lists, the
    visible skipped items and a summary.
    """
    relay_cfg = config.get("relay_handling", {})
    parse_rules = relay_cfg.get("parse_rules", {}) if isinstance(relay_cfg, dict) else {}
    trigger = str(relay_cfg.get("trigger_keyword", "#接龙"))
    line_pattern = str(parse_rules.get("line_pattern", r"^\d+、\s*"))
    skip_lines = parse_rules.get("skip_lines", [])
    skip_lines = [str(x) for x in skip_lines] if isinstance(skip_lines, list) else []

    has_trigger = trigger in raw_text
    lines = [line.strip() for line in raw_text.splitlines() if line.strip()]

    # Phase 1: line filtering and normalization.
    filtered_lines: list[dict[str, str]] = []
    skipped: list[dict[str, str]] = []
    for line in lines:
        source_line = line.strip()
        # A line that is exactly the trigger keyword is dropped silently.
        if trigger and trigger in source_line and source_line == trigger:
            continue

        normalized = normalize_line(source_line, line_pattern)
        if not normalized:
            continue

        if is_date_header_line(normalized):
            skipped.append({"line": source_line, "reason": "skip_line_rule"})
            continue

        if any(token and token in normalized for token in skip_lines):
            skipped.append({"line": source_line, "reason": "skip_line_rule"})
            continue

        filtered_lines.append(
            {
                "source_line": source_line,
                "normalized_line": normalized,
                "line_serial": extract_line_serial(source_line, line_pattern),
            }
        )

    # Fallback: treat the whole input as a single line when nothing survived.
    if not filtered_lines and raw_text.strip():
        source_line = raw_text.strip()
        normalized_single = normalize_line(source_line, line_pattern)
        if normalized_single:
            filtered_lines = [
                {
                    "source_line": source_line,
                    "normalized_line": normalized_single,
                    "line_serial": extract_line_serial(source_line, line_pattern),
                }
            ]

    branches_cfg = config.get("branches", {})
    allowed_branches = set(str(x) for x in branches_cfg.get("allowed", [])) if isinstance(branches_cfg, dict) else set()

    # Dedup configuration and lookup tables (history + in-batch).
    dedup_cfg = relay_cfg.get("dedup", {}) if isinstance(relay_cfg, dict) else {}
    key_fields = dedup_cfg.get("key_fields", ["branch", "amount", "type"])
    if not isinstance(key_fields, list) or not key_fields:
        key_fields = ["branch", "amount", "type"]
    key_fields = [str(k) for k in key_fields]
    insurance_year_map = dict(insurance_year_choices) if isinstance(insurance_year_choices, dict) else {}

    history_lookup = build_history_lookup(history, key_fields)
    history_keys = set(history_lookup.keys())
    history_signature_lookup = build_history_signature_lookup(history)
    history_signature_keys = set(history_signature_lookup.keys())
    batch_keys: set[str] = set()
    batch_signature_keys: set[str] = set()
    suppressed_lookup = build_skip_suppression_lookup(list_skip_suppressions())
    manual_rules = load_manual_rules()

    records: list[dict[str, Any]] = []
    new_records: list[dict[str, Any]] = []
    pending_insurance_records: list[dict[str, Any]] = []
    duplicate_records: list[dict[str, Any]] = []

    # Phase 2: build one record per detected product on each line.
    for line_item in filtered_lines:
        source_line = str(line_item.get("source_line", ""))
        normalized_line = str(line_item.get("normalized_line", ""))
        line_serial = str(line_item.get("line_serial", "")).strip()
        normalized_signature = line_signature(normalized_line)

        branch_raw, branch = extract_branch(normalized_line, config)
        if not branch:
            skipped.append({"line": source_line, "reason": "branch_not_found"})
            continue

        if allowed_branches and branch not in allowed_branches:
            skipped.append({"line": source_line, "reason": f"branch_not_allowed:{branch}"})
            continue

        status = normalize_status_value(extract_status(normalized_line, config), config)
        products = detect_products(normalized_line, config, insurance_year_choice)
        if not products:
            # Demand-deposit-only lines are suppressed intentionally.
            if is_demand_deposit_only_line(normalized_line):
                skipped.append({"line": source_line, "reason": "demand_deposit_not_generate"})
            else:
                skipped.append({"line": source_line, "reason": "type_not_found"})
            continue

        for product_index, product in enumerate(products, start=1):
            amount = str(product.get("amount", "")).strip()
            if not amount:
                skipped.append({"line": source_line, "reason": "amount_not_found"})
                continue

            type_keyword = str(product.get("type", "")).strip()
            page = str(product.get("page", "")).strip()
            if not type_keyword or not page:
                skipped.append({"line": source_line, "reason": "type_not_found"})
                continue

            record = {
                "source_line": source_line,
                "raw_text": normalized_line,
                "branch_raw": branch_raw,
                "branch": branch,
                "amount": amount,
                "type": type_keyword,
                "page": page,
                "status": status,
                "needs_insurance_year": bool(product.get("needs_insurance_year", False)),
            }
            if line_serial:
                record["line_serial"] = line_serial
            if normalized_signature:
                record["line_signature"] = normalized_signature
            record["line_product_index"] = str(product_index)

            # Resolve a pending 3/5-year insurance choice from user input.
            if record["needs_insurance_year"]:
                choice_key = build_insurance_choice_key(
                    source_line=source_line,
                    raw_text=normalized_line,
                    branch=branch,
                    amount=amount,
                    page=page,
                    item_index=product_index,
                )
                record["insurance_choice_key"] = choice_key
                selected_year = pick_insurance_year_for_record(
                    choice_key=choice_key,
                    source_line=source_line,
                    raw_text=normalized_line,
                    branch=branch,
                    amount=amount,
                    year_map=insurance_year_map,
                    default_year=insurance_year_choice,
                )
                if selected_year in {"3", "5"}:
                    record["type"] = f"{selected_year}年交"
                    record["needs_insurance_year"] = False
                    record["insurance_year"] = selected_year

            # Manual correction rules may rewrite any field.
            record, hit_rule_ids = apply_manual_rules_to_record(
                record,
                source_line=source_line,
                normalized_line=normalized_line,
                rules=manual_rules,
                config=config,
            )
            record["amount"] = normalize_amount_text(record.get("amount", ""))
            record["status"] = normalize_status_value(str(record.get("status", "")), config)
            if hit_rule_ids:
                record["applied_rule_ids"] = hit_rule_ids
            # A manual rule may have forced the type to a concrete 3/5-year form.
            if record["needs_insurance_year"]:
                forced_year = normalize_insurance_year(record.get("type"))
                if forced_year in {"3", "5"}:
                    record["type"] = f"{forced_year}年交"
                    record["needs_insurance_year"] = False
                    record["insurance_year"] = forced_year

            # Re-validate after rule application.
            if not str(record.get("amount", "")).strip():
                skipped.append({"line": source_line, "reason": "amount_not_found"})
                continue
            if not str(record.get("page", "")).strip():
                inferred_page = infer_page_from_type(str(record.get("type", "")), config)
                if inferred_page:
                    record["page"] = inferred_page
            pages = config.get("pages", {})
            if isinstance(pages, dict) and str(record.get("page", "")).strip() not in pages:
                skipped.append({"line": source_line, "reason": "type_not_found"})
                continue
            if allowed_branches and str(record.get("branch", "")).strip() not in allowed_branches:
                skipped.append({"line": source_line, "reason": f"branch_not_allowed:{record.get('branch', '')}"})
                continue

            records.append(record)

            # Dedup: positional key plus content-signature key.
            dedup_key = key_for_record(record, key_fields)
            record["dedup_key"] = dedup_key
            signature_key = signature_key_for_record(record)
            if signature_key:
                record["signature_key"] = signature_key

            history_record = history_lookup.get(dedup_key)
            if history_record is None and signature_key:
                history_record = history_signature_lookup.get(signature_key)

            if history_record is not None or dedup_key in history_keys or (signature_key and signature_key in history_signature_keys):
                dup = dict(record)
                dup["duplicate_reason"] = "history_duplicate"
                attach_duplicate_download_info(dup, history_record, config)
                duplicate_records.append(dup)
                continue

            if dedup_key in batch_keys or (signature_key and signature_key in batch_signature_keys):
                dup = dict(record)
                dup["duplicate_reason"] = "input_duplicate"
                duplicate_records.append(dup)
                continue

            # Insurance without explicit 3/5 year should ask user only when truly pending.
            # If this line was manually suppressed in skip rules, keep it out of prompt queue.
            if record["needs_insurance_year"]:
                if is_skip_item_suppressed(source_line, "insurance_year_pending", suppressed_lookup):
                    batch_keys.add(dedup_key)
                    if signature_key:
                        batch_signature_keys.add(signature_key)
                    continue

                pending = dict(record)
                pending["insurance_options"] = ["3", "5"]
                pending_insurance_records.append(pending)
                batch_keys.add(dedup_key)
                if signature_key:
                    batch_signature_keys.add(signature_key)
                continue

            batch_keys.add(dedup_key)
            if signature_key:
                batch_signature_keys.add(signature_key)
            new_records.append(record)

    # Phase 3: assign output filenames and build the result payload.
    for i, rec in enumerate(new_records, start=1):
        rec["output_file"] = render_output_filename(config, rec, i)

    # Hide skipped items the user has explicitly suppressed.
    visible_skipped: list[dict[str, str]] = []
    for item in skipped:
        line_text = str(item.get("line", ""))
        reason_text = str(item.get("reason", ""))
        if is_skip_item_suppressed(line_text, reason_text, suppressed_lookup):
            continue
        visible_skipped.append(item)

    summary = {
        "input_lines": len(lines),
        "parsed": len(records),
        "new": len(new_records),
        "duplicate": len(duplicate_records),
        "skipped": len(visible_skipped),
        "insurance_pending": len(pending_insurance_records),
    }

    return {
        "has_trigger": has_trigger,
        "records": records,
        "new_records": new_records,
        "pending_insurance_records": pending_insurance_records,
        "duplicate_records": duplicate_records,
        "skipped": visible_skipped,
        "summary": summary,
        "dedup_key_fields": key_fields,
        "needs_insurance_choice": len(pending_insurance_records) > 0,
    }
|
||
|
||
|
||
def append_new_history(
    history_path: Path,
    current_history: list[dict[str, Any]],
    records: list[dict[str, Any]],
    key_fields: list[str],
) -> dict[str, Any]:
    """Append records whose dedup key is not yet in history, then persist.

    Returns a summary dict with the number of entries added and the new total.
    """
    seen = {key_for_record(entry, key_fields) for entry in current_history}
    stamp = datetime.now().isoformat(timespec="seconds")
    fresh: list[dict[str, Any]] = []

    for rec in records:
        if not isinstance(rec, dict):
            continue
        key = key_for_record(rec, key_fields)
        if not key or key in seen:
            continue

        entry = dict(rec)
        # Transient download fields must not be persisted.
        entry.pop("download_token", None)
        entry.pop("download_url", None)
        entry["created_at"] = stamp
        fresh.append(entry)
        seen.add(key)

    combined = current_history + fresh
    save_history(history_path, combined)

    return {
        "added": len(fresh),
        "total": len(combined),
    }
|
||
|
||
|
||
def upsert_history_records(
    history_path: Path,
    current_history: list[dict[str, Any]],
    records: list[dict[str, Any]],
    key_fields: list[str],
) -> dict[str, Any]:
    """Insert-or-replace *records* into history keyed by their dedup key, then persist.

    Returns counts of added, replaced and total entries.
    """
    merged = [entry for entry in current_history if isinstance(entry, dict)]
    position_by_key: dict[str, int] = {}
    for pos, entry in enumerate(merged):
        key = key_for_record(entry, key_fields)
        if key:
            position_by_key[key] = pos

    stamp = datetime.now().isoformat(timespec="seconds")
    added = 0
    replaced = 0
    for rec in records:
        if not isinstance(rec, dict):
            continue
        entry = dict(rec)
        # Transient download fields must not be persisted.
        entry.pop("download_token", None)
        entry.pop("download_url", None)
        entry["created_at"] = stamp
        key = key_for_record(entry, key_fields)
        if not key:
            continue
        if key in position_by_key:
            merged[position_by_key[key]] = entry
            replaced += 1
        else:
            position_by_key[key] = len(merged)
            merged.append(entry)
            added += 1

    save_history(history_path, merged)
    return {
        "added": added,
        "replaced": replaced,
        "total": len(merged),
    }
|
||
|
||
|
||
def page_to_slide_index(page_name: str) -> int:
    """Map a "page_<n>" page name to the integer n; raise ValueError otherwise."""
    match = re.match(r"^page_(\d+)$", page_name)
    if match is None:
        raise ValueError(f"invalid page name: {page_name}")
    return int(match.group(1))
|
||
|
||
|
||
def safe_filename(name: str) -> str:
    """Sanitize *name* for filesystem use; fall back to a random name when empty."""
    cleaned = re.sub(r"[\\/:*?\"<>|\r\n]+", "_", name).strip()
    cleaned = re.sub(r"\s+", "_", cleaned)
    if cleaned:
        return cleaned
    return f"file_{uuid.uuid4().hex[:8]}"
|
||
|
||
|
||
def amount_to_digits(amount: str) -> str:
    """Render an amount as bare digits, stripping 万/万元 units when parsing fails."""
    floored = floor_amount_number(amount)
    if floored is not None:
        return str(floored)
    # Unparseable amount: keep the raw text minus the unit suffixes.
    text = str(amount).strip()
    return text.replace("万元", "").replace("万", "")
|
||
|
||
|
||
def pick_paragraph_index(paragraph_count: int, configured_index: int) -> int:
    """Clamp a configured paragraph index to a valid 0-based position.

    Accepts both 0-based and 1-based configuration values; falls back to 0.
    """
    if paragraph_count <= 0:
        return 0
    if 0 <= configured_index < paragraph_count:
        return configured_index
    # Re-interpret the value as 1-based when that lands in range.
    shifted = configured_index - 1
    return shifted if 0 <= shifted < paragraph_count else 0
|
||
|
||
|
||
def paragraph_plain_text(para: Any) -> str:
    """Collect a paragraph's visible text, preferring its runs over the aggregate."""
    runs = getattr(para, "runs", None)
    if runs:
        return "".join(run.text or "" for run in runs)
    return str(getattr(para, "text", "") or "")
|
||
|
||
|
||
def set_paragraph_text(para: Any, text: str) -> None:
    """Write *text* into a paragraph, reusing the first run to keep its formatting."""
    runs = getattr(para, "runs", None)
    if not runs:
        para.text = text
        return
    runs[0].text = text
    # Blank the remaining runs so no stale fragments survive.
    for extra in runs[1:]:
        extra.text = ""
|
||
|
||
|
||
def paragraph_has_branch_marker(para: Any) -> bool:
    """True when the paragraph's text mentions 营业所 (the branch-office marker)."""
    stripped = paragraph_plain_text(para).strip()
    if not stripped:
        return False
    return "营业所" in stripped
|
||
|
||
|
||
def replace_company_placeholder_text(text: str, company_replacement: str) -> str:
    """Swap the first placeholder token (e.g. "XX"/"**") before 分公司 for the real name."""
    replacement = str(company_replacement or "").strip()
    source = str(text or "")
    if not replacement:
        return source
    # NOTE(review): the class [XxXx**] contains duplicate members; it may have
    # been meant to include full-width Ｘ/＊ variants — confirm against templates.
    return re.sub(r"([XxXx**]+)\s*(?=分公司)", replacement, source, count=1)
|
||
|
||
|
||
def build_branch_text(
    original_text: str,
    branch: str,
    fallback_text: str,
    company_replacement: str = "",
) -> str:
    """Rewrite the 营业所 (branch) segment of a template line to show *branch*.

    Keeps any prefix such as "永州市道县分公司" intact; returns *fallback_text*
    when the original line is blank or carries no 营业所 segment at all.
    """
    text = str(original_text or "")
    if not text.strip():
        return fallback_text

    # Placeholder tokens like "XX/**" before 分公司 are resolved first so the
    # output never reads "XX分公司".
    text = replace_company_placeholder_text(text, company_replacement)

    # Prefer the branch segment that directly follows 分公司.
    after_company = re.search(r"(分公司)([^|,。,;;\s]*)营业所", text)
    if after_company:
        head = text[: after_company.start(2)]
        tail = text[after_company.end():]
        return f"{head}{branch}营业所{tail}"

    anywhere = re.search(r"([^|,。,;;\s]*)营业所", text)
    if anywhere:
        head = text[: anywhere.start(1)]
        tail = text[anywhere.end():]
        return f"{head}{branch}营业所{tail}"

    return fallback_text
|
||
|
||
|
||
def replace_branch_in_slide(slide: Any, branch: str, config: dict[str, Any]) -> None:
    """Rewrite the branch (营业所) line on *slide* to show *branch*.

    Location defaults come from ``replace_algorithm.branch``, falling back to
    ``branches.template_location``.  If the configured shape/paragraph does not
    contain the 营业所 marker, the whole slide is scanned for one; if nothing
    matches, the configured paragraph is overwritten with the formatted text.
    """
    branch_cfg = config.get("replace_algorithm", {}).get("branch", {})
    location_cfg = config.get("branches", {}).get("template_location", {})

    shape_index = int(branch_cfg.get("shape_index", location_cfg.get("shape_index", 3)))
    paragraph_index = int(branch_cfg.get("paragraph_index", location_cfg.get("paragraph_index", 1)))
    fmt = str(branch_cfg.get("format", location_cfg.get("display_format", "{branch}营业所")))
    company_replacement = str(
        branch_cfg.get("company_replacement", location_cfg.get("company_replacement", ""))
    ).strip()
    new_text = fmt.format(branch=branch)

    target_para = None    # paragraph that carries the 营业所 marker
    fallback_para = None  # configured paragraph, used when no marker is found
    target_shape = None
    cfg_paragraphs = []

    # First try the configured shape/paragraph.
    if 0 <= shape_index < len(slide.shapes):
        cfg_shape = slide.shapes[shape_index]
        if getattr(cfg_shape, "has_text_frame", False):
            cfg_paragraphs = cfg_shape.text_frame.paragraphs
            if cfg_paragraphs:
                p_index = pick_paragraph_index(len(cfg_paragraphs), paragraph_index)
                fallback_para = cfg_paragraphs[p_index]
                if paragraph_has_branch_marker(cfg_paragraphs[p_index]):
                    target_para = cfg_paragraphs[p_index]
                    target_shape = cfg_shape
                else:
                    # Configured paragraph lacks the marker: scan the same shape.
                    for para in cfg_paragraphs:
                        if paragraph_has_branch_marker(para):
                            target_para = para
                            target_shape = cfg_shape
                            break

    # Last resort: scan every text frame on the slide for the marker.
    if target_para is None:
        for shape in slide.shapes:
            if not getattr(shape, "has_text_frame", False):
                continue
            paragraphs = shape.text_frame.paragraphs
            for para in paragraphs:
                if paragraph_has_branch_marker(para):
                    target_para = para
                    target_shape = shape
                    break
            if target_para is not None:
                break

    if target_para is not None:
        original_text = paragraph_plain_text(target_para)
        set_paragraph_text(
            target_para,
            build_branch_text(original_text, branch, new_text, company_replacement),
        )
        # Also patch company placeholders (e.g. "XX分公司") in sibling paragraphs
        # of the same shape.
        if company_replacement and target_shape is not None and getattr(target_shape, "has_text_frame", False):
            for para in target_shape.text_frame.paragraphs:
                old_text = paragraph_plain_text(para)
                new_company_text = replace_company_placeholder_text(old_text, company_replacement)
                if new_company_text != old_text:
                    set_paragraph_text(para, new_company_text)
        return

    # No marker anywhere on the slide: overwrite the configured paragraph.
    if fallback_para is not None:
        set_paragraph_text(fallback_para, new_text)
|
||
|
||
|
||
def replace_amount_in_slide(slide: Any, shape_index: int, amount: str) -> None:
    """Replace the numeric amount run inside shape *shape_index* with *amount*.

    Candidate runs are those whose stripped text is all digits (optionally with
    a decimal part) or a run of asterisk placeholders.  Among candidates, runs
    adjacent to a 万/万元 unit run are preferred; otherwise only the last
    candidate is replaced.  Silently does nothing when the shape is missing,
    has no text frame, or no candidate run exists.
    """
    if shape_index < 0 or shape_index >= len(slide.shapes):
        return

    shape = slide.shapes[shape_index]
    if not getattr(shape, "has_text_frame", False):
        return

    amount_digits = amount_to_digits(amount)
    candidates: list[tuple[Any, int, str]] = []
    paragraphs = shape.text_frame.paragraphs

    # Collect every run that looks like a numeric value or a masked placeholder.
    # NOTE(review): the class [\**] reduces to just "*"; it may have been meant
    # to include the full-width ＊ variant — confirm against templates.
    for para in paragraphs:
        runs = list(getattr(para, "runs", []) or [])
        for idx, run in enumerate(runs):
            t = str(getattr(run, "text", "") or "").strip()
            if re.fullmatch(r"\d+(?:\.\d+)?", t) or re.fullmatch(r"[\**]+", t):
                candidates.append((para, idx, t))

    if not candidates:
        return

    def next_nonempty_text(para: Any, start_idx: int) -> str:
        # First non-blank run text after position start_idx in this paragraph.
        runs = list(getattr(para, "runs", []) or [])
        for j in range(start_idx + 1, len(runs)):
            txt = str(getattr(runs[j], "text", "") or "").strip()
            if txt:
                return txt
        return ""

    def prev_nonempty_text(para: Any, start_idx: int) -> str:
        # First non-blank run text before position start_idx in this paragraph.
        runs = list(getattr(para, "runs", []) or [])
        for j in range(start_idx - 1, -1, -1):
            txt = str(getattr(runs[j], "text", "") or "").strip()
            if txt:
                return txt
        return ""

    preferred: list[tuple[Any, int, str]] = []
    for para, idx, _ in candidates:
        nxt = next_nonempty_text(para, idx)
        prv = prev_nonempty_text(para, idx)
        # Prefer the amount token that is adjacent to "万/万元".
        if ("万" in nxt) or ("万元" in nxt) or ("万" in prv) or ("万元" in prv):
            preferred.append((para, idx, ""))

    # Fall back to the last candidate when no run sits next to a unit marker.
    targets = preferred if preferred else [candidates[-1]]
    for para, idx, _ in targets:
        runs = list(getattr(para, "runs", []) or [])
        if 0 <= idx < len(runs):
            runs[idx].text = amount_digits
|
||
|
||
|
||
def replace_status_in_slide(
    slide: Any,
    shape_index: int,
    status: str,
    page_name: str,
    config: dict[str, Any],
) -> None:
    """Write the normalized status text into shape *shape_index* on *slide*.

    Pages flagged ``has_bracket`` render the status wrapped in parentheses,
    centered, with word wrap disabled.  Replacement prefers an existing
    status run so the template's font and size stay untouched.
    """
    if shape_index < 0 or shape_index >= len(slide.shapes):
        return

    shape = slide.shapes[shape_index]
    if not getattr(shape, "has_text_frame", False):
        return

    normalized_status = normalize_status_value(status, config)
    page_cfg = config.get("pages", {}).get(page_name, {})
    has_bracket = bool(page_cfg.get("has_bracket", False)) if isinstance(page_cfg, dict) else False
    target_text = f"({normalized_status})" if has_bracket else normalized_status
    valid_status = get_valid_status_list(config)

    paragraphs = shape.text_frame.paragraphs
    if not paragraphs:
        shape.text_frame.text = target_text
        return

    # Pick the paragraph that already carries status-like text.
    target_para = None
    for para in paragraphs:
        para_text = "".join(run.text for run in para.runs).strip() if para.runs else para.text.strip()
        if not para_text:
            continue
        if any(s in para_text for s in valid_status) or ("成功营销" in para_text) or ("(" in para_text and ")" in para_text):
            target_para = para
            break

    # Fall back to the first non-empty paragraph, then to the first paragraph.
    if target_para is None:
        for para in paragraphs:
            para_text = "".join(run.text for run in para.runs).strip() if para.runs else para.text.strip()
            if para_text:
                target_para = para
                break
    if target_para is None:
        target_para = paragraphs[0]

    if target_para.runs:
        replaced = False

        # Prefer replacing an existing status run so template font/size stay intact.
        for run in target_para.runs:
            raw = run.text or ""
            trimmed = raw.strip()
            if not raw:
                continue
            if trimmed in valid_status or trimmed == "成功营销":
                run.text = raw.replace(trimmed, normalized_status, 1)
                replaced = True
                break

        if not replaced and has_bracket:
            for run in target_para.runs:
                raw = run.text or ""
                if "(" in raw and ")" in raw:
                    # BUG FIX: the parentheses must be escaped.  The previous
                    # pattern r"([^)]*)" was a *capture group* matching a
                    # (possibly empty) run of non-")" characters at the start
                    # of the string, not the bracketed "(...)" span, so the
                    # status was spliced into the wrong place.  A callable
                    # replacement also keeps any backslashes in the status
                    # text literal.
                    run.text = re.sub(r"\([^)]*\)", lambda _m: f"({normalized_status})", raw, count=1)
                    replaced = True
                    break

        if not replaced:
            # Last resort: overwrite the first run and clear the rest.
            target_para.runs[0].text = target_text
            for run in target_para.runs[1:]:
                run.text = ""
    else:
        target_para.text = target_text

    if has_bracket:
        target_para.alignment = PP_ALIGN.CENTER
        # Prevent the bracketed status from wrapping to a second line.
        shape.text_frame.word_wrap = False
|
||
|
||
|
||
def replace_page_display_name_in_slide(slide: Any, page_name: str, config: dict[str, Any]) -> None:
    """Replace the page's title/alias text on *slide* with its configured display name.

    Aliases come from ``pages.<page>.display_aliases`` plus hard-coded legacy
    wordings for page_5.  Matching is done on whitespace-normalized text (via
    ``normalize_skip_line``); at most one paragraph per shape is rewritten.
    """
    pages = config.get("pages", {})
    if not isinstance(pages, dict):
        return
    page_cfg = pages.get(page_name, {})
    if not isinstance(page_cfg, dict):
        return
    display_name = str(page_cfg.get("display_name", "")).strip()
    if not display_name:
        return

    aliases = page_cfg.get("display_aliases", [])
    alias_list: list[str] = [display_name]
    if isinstance(aliases, list):
        alias_list.extend([str(x).strip() for x in aliases if str(x).strip()])
    if page_name == "page_5":
        # Compatibility with wording used by older templates.
        alias_list.extend(["两三期理财", "两三年期理财"])
    # De-duplicate while preserving order.
    alias_list = [x for x in dict.fromkeys(alias_list) if x]

    for shape in slide.shapes:
        if not getattr(shape, "has_text_frame", False):
            continue
        paragraphs = shape.text_frame.paragraphs
        for para in paragraphs:
            raw_text = paragraph_plain_text(para)
            compact_text = normalize_skip_line(raw_text)
            if not compact_text:
                continue
            for alias in alias_list:
                alias_compact = normalize_skip_line(alias)
                if not alias_compact or alias_compact not in compact_text:
                    continue

                if compact_text == alias_compact:
                    # Whole paragraph is the alias: replace it outright.
                    set_paragraph_text(para, display_name)
                elif alias in raw_text:
                    # Literal alias appears in the raw text: in-place substitution.
                    set_paragraph_text(para, raw_text.replace(alias, display_name, 1))
                elif len(compact_text) <= 24:
                    # Short paragraph: substitute on the normalized form.
                    set_paragraph_text(para, compact_text.replace(alias_compact, display_name, 1))
                else:
                    continue

                # Avoid "one extra character triggers an automatic line wrap".
                shape.text_frame.word_wrap = False
                break
|
||
|
||
|
||
def keep_only_target_slide(prs: Presentation, keep_index: int) -> bool:
    """Delete every slide except *keep_index* from *prs*; return True on success.

    Uses python-pptx internals (the ``_sldIdLst`` element and the part's
    relationship table), so any failure — including an out-of-range index —
    is reported as False rather than raised.
    """
    try:
        sld_id_list = prs.slides._sldIdLst  # type: ignore[attr-defined]
        total = len(prs.slides)
        if keep_index < 0 or keep_index >= total:
            return False
        # Iterate backwards so earlier deletions do not shift pending indices.
        for idx in range(total - 1, -1, -1):
            if idx == keep_index:
                continue
            rel_id = sld_id_list[idx].rId
            # Drop both the relationship and the slide-id entry.
            prs.part.drop_rel(rel_id)
            del sld_id_list[idx]
        return True
    except Exception:
        # Best-effort: callers fall back to exporting the full deck.
        return False
|
||
|
||
|
||
def is_single_slide_output_enabled(config: dict[str, Any]) -> bool:
    """Whether exported decks should be trimmed to the single target slide (default on)."""
    perf = config.get("performance", {})
    if not isinstance(perf, dict):
        return True
    return bool(perf.get("single_slide_output", True))
|
||
|
||
|
||
def resolve_generation_strategy(config: dict[str, Any], total_records: int) -> str:
    """Pick "legacy" or "page_template_cache" generation based on performance config.

    The cache strategy is only used for batches of at least
    ``template_cache_min_records`` records and when single-slide output is on.
    """
    perf = config.get("performance", {})
    if not isinstance(perf, dict):
        return "legacy"

    strategy = str(perf.get("generation_strategy", "page_template_cache")).strip().lower()
    if strategy not in {"legacy", "page_template_cache"}:
        strategy = "page_template_cache"
    if strategy == "legacy":
        return "legacy"

    try:
        threshold = int(perf.get("template_cache_min_records", 2))
    except Exception:
        threshold = 2
    if total_records < max(1, threshold):
        return "legacy"

    if not is_single_slide_output_enabled(config):
        return "legacy"

    return "page_template_cache"
|
||
|
||
|
||
def _resolve_perf_section(config: dict[str, Any]) -> dict[str, Any]:
|
||
perf_cfg = config.get("performance", {})
|
||
return perf_cfg if isinstance(perf_cfg, dict) else {}
|
||
|
||
|
||
def resolve_single_generation_mode(config: dict[str, Any]) -> bool:
    """Whether generation jobs must run one at a time (default True)."""
    section = config.get("performance", {})
    if not isinstance(section, dict):
        section = {}
    return bool(section.get("single_generation_mode", True))
|
||
|
||
|
||
def resolve_build_workers(config: dict[str, Any], total_records: int) -> int:
    """Worker-thread count for PPT builds: configured value clamped to [1, 6],
    then bounded by the record count and CPU count (never below 1)."""
    section = config.get("performance", {})
    if not isinstance(section, dict):
        section = {}
    try:
        configured = int(section.get("max_build_workers", 1))
    except Exception:
        configured = 1
    configured = max(1, min(6, configured))
    return max(1, min(configured, total_records, (os.cpu_count() or 2)))
|
||
|
||
|
||
def resolve_extract_workers(config: dict[str, Any]) -> int:
    """Parallelism for PDF→PNG extraction: configured value clamped by 6 and CPU count."""
    section = config.get("performance", {})
    if not isinstance(section, dict):
        section = {}
    try:
        configured = int(section.get("max_extract_workers", 1))
    except Exception:
        configured = 1
    return max(1, min(6, configured, (os.cpu_count() or 2)))
|
||
|
||
|
||
def resolve_memory_reclaim_options(config: dict[str, Any]) -> dict[str, bool]:
    """Resolve performance.memory_reclaim flags; sub-flags are forced off when disabled."""
    section = config.get("performance", {})
    if not isinstance(section, dict):
        section = {}
    reclaim = section.get("memory_reclaim", {})
    if not isinstance(reclaim, dict):
        reclaim = {}
    enabled = bool(reclaim.get("enabled", True))
    return {
        "enabled": enabled,
        "gc_collect": enabled and bool(reclaim.get("gc_collect", True)),
        "malloc_trim": enabled and bool(reclaim.get("malloc_trim", True)),
    }
|
||
|
||
|
||
def _try_malloc_trim() -> bool:
    """Ask glibc to hand free heap pages back to the OS; best-effort, POSIX only."""
    global _MALLOC_TRIM_FN
    if os.name != "posix":
        return False
    try:
        if _MALLOC_TRIM_FN is None:
            # Resolve and cache glibc's malloc_trim on first use.
            trim = ctypes.CDLL("libc.so.6").malloc_trim
            trim.argtypes = [ctypes.c_size_t]
            trim.restype = ctypes.c_int
            _MALLOC_TRIM_FN = trim
        return bool(_MALLOC_TRIM_FN(0))
    except Exception:
        # Non-glibc platforms or load failures are silently tolerated.
        return False
|
||
|
||
|
||
def reclaim_runtime_memory(config: dict[str, Any]) -> None:
    """Run the configured memory-reclaim steps: a GC pass, then glibc malloc_trim."""
    options = resolve_memory_reclaim_options(config)
    if not options["enabled"]:
        return
    if options["gc_collect"]:
        gc.collect()
    if options["malloc_trim"]:
        _try_malloc_trim()
|
||
|
||
|
||
def build_page_template_cache(
    template_path: Path,
    page_names: set[str],
) -> dict[str, bytes]:
    """Pre-render one single-slide PPTX payload per page so batch jobs skip re-trimming.

    Raises RuntimeError when a page index is out of range or trimming fails.
    """
    cache: dict[str, bytes] = {}
    for name in sorted(page_names):
        deck = Presentation(str(template_path))
        index = page_to_slide_index(name)
        if not (0 <= index < len(deck.slides)):
            raise RuntimeError(f"slide index out of range for page={name}")
        if not keep_only_target_slide(deck, index):
            raise RuntimeError(f"failed to prepare single-slide template for page={name}")
        payload = io.BytesIO()
        deck.save(payload)
        cache[name] = payload.getvalue()
    return cache
|
||
|
||
|
||
def resolve_image_delivery_options(config: dict[str, Any]) -> dict[str, Any]:
    """Resolve image-download throttling options (max_kbps of 0 means unthrottled)."""
    perf = config.get("performance", {})
    delivery = perf.get("image_delivery", {}) if isinstance(perf, dict) else {}
    if not isinstance(delivery, dict):
        delivery = {}

    enabled = bool(delivery.get("enabled", True))

    try:
        max_kbps = int(delivery.get("max_kbps", 300))
    except Exception:
        max_kbps = 300
    max_kbps = max(0, max_kbps)

    try:
        chunk_kb = int(delivery.get("chunk_kb", 16))
    except Exception:
        chunk_kb = 16
    chunk_kb = max(4, min(256, chunk_kb))

    if not enabled:
        # Disabled throttling is expressed as an unlimited rate.
        max_kbps = 0

    return {
        "enabled": enabled,
        "max_kbps": max_kbps,
        "chunk_size": chunk_kb * 1024,
    }
|
||
|
||
|
||
def build_ppt_for_record(
    template_path: Path,
    config: dict[str, Any],
    record: dict[str, Any],
    out_pptx: Path,
    page_template_cache: dict[str, bytes] | None = None,
) -> int:
    """Fill the template slide for one record and save the deck to *out_pptx*.

    When a cached single-slide payload exists for the record's page, it is used
    and the target is slide 0; otherwise the full template is loaded and the
    page name resolved to a slide index.  Returns the 1-based page number the
    downstream PDF→PNG step must extract.

    Raises RuntimeError when the resolved slide index is out of range.
    """
    page_name = str(record.get("page", ""))
    cached_payload = page_template_cache.get(page_name) if isinstance(page_template_cache, dict) else None
    if cached_payload:
        # Cached payload is already trimmed to a single slide.
        prs = Presentation(io.BytesIO(cached_payload))
        slide_index = 0
    else:
        prs = Presentation(str(template_path))
        slide_index = page_to_slide_index(page_name)
        if slide_index < 0 or slide_index >= len(prs.slides):
            raise RuntimeError(f"slide index out of range for page={record.get('page')}")

    slide = prs.slides[slide_index]
    page_cfg = config.get("pages", {}).get(page_name, {})
    amount_shape = int(page_cfg.get("amount_shape", 2))
    status_shape = int(page_cfg.get("status_shape", 3))
    normalized_status = normalize_status_value(str(record.get("status", "")), config)

    # Apply all per-record substitutions in place.
    replace_branch_in_slide(slide, str(record.get("branch", "")), config)
    replace_amount_in_slide(slide, amount_shape, str(record.get("amount", "")))
    replace_status_in_slide(slide, status_shape, normalized_status, page_name, config)
    replace_page_display_name_in_slide(slide, page_name, config)

    single_slide_output = is_single_slide_output_enabled(config)
    export_page_number = slide_index + 1
    # If trimming succeeds the exported deck contains only page 1.
    if not cached_payload and single_slide_output and keep_only_target_slide(prs, slide_index):
        export_page_number = 1

    prs.save(str(out_pptx))
    return export_page_number
|
||
|
||
|
||
def run_subprocess(args: list[str], timeout: int = 300) -> subprocess.CompletedProcess[str]:
    """Run *args* capturing text output; never raises on non-zero exit (check=False)."""
    return subprocess.run(
        args,
        capture_output=True,
        text=True,
        check=False,
        timeout=timeout,
    )
|
||
|
||
|
||
def use_local_office_converter() -> bool:
    """True when a local libreoffice/soffice binary was found and still exists."""
    if not LOCAL_LIBREOFFICE_BIN:
        return False
    return Path(str(LOCAL_LIBREOFFICE_BIN)).exists()
|
||
|
||
|
||
def use_local_pdftoppm() -> bool:
    """True when a local pdftoppm binary was found and still exists."""
    if not LOCAL_PDFTOPPM_BIN:
        return False
    return Path(str(LOCAL_PDFTOPPM_BIN)).exists()
|
||
|
||
|
||
def ensure_converter_image() -> None:
    """Make sure a PPTX→PDF converter is available, pulling the docker image if needed.

    A local LibreOffice binary short-circuits the docker path entirely.
    Uses double-checked locking on the module-level CONVERTER_IMAGE_READY flag
    so concurrent callers trigger at most one ``docker pull``.

    Raises RuntimeError when the image cannot be pulled.
    """
    global CONVERTER_IMAGE_READY
    if use_local_office_converter():
        CONVERTER_IMAGE_READY = True
        return
    if CONVERTER_IMAGE_READY:
        return

    with _CONVERTER_LOCK:
        # Re-check under the lock: another thread may have finished the pull.
        if CONVERTER_IMAGE_READY:
            return

        # Pull only when the image is not already present locally.
        inspect = run_subprocess(["docker", "image", "inspect", CONVERTER_IMAGE], timeout=60)
        if inspect.returncode != 0:
            pull = run_subprocess(["docker", "pull", CONVERTER_IMAGE], timeout=900)
            if pull.returncode != 0:
                raise RuntimeError(
                    f"failed to pull docker image {CONVERTER_IMAGE}: {pull.stderr.strip() or pull.stdout.strip()}"
                )

        CONVERTER_IMAGE_READY = True
|
||
|
||
|
||
def prewarm_converter_image(background: bool = True) -> None:
    """Pull the converter docker image ahead of time, optionally on a daemon thread."""
    def _warm() -> None:
        try:
            ensure_converter_image()
            print(f"Converter image ready: {CONVERTER_IMAGE}")
        except Exception as exc:
            # Prewarming is best-effort; real conversions retry later.
            print(f"Converter image prewarm failed: {exc}")

    if not background:
        _warm()
        return
    worker = threading.Thread(target=_warm, name="converter-prewarm", daemon=True)
    worker.start()
|
||
|
||
|
||
def convert_pptx_to_pdf_batch(
    job_dir: Path,
    pptx_paths: list[Path],
    progress_cb: Callable[[int, str, str], None] | None = None,
) -> list[Path]:
    """Convert all *pptx_paths* to PDFs in *job_dir* with one LibreOffice invocation.

    Uses the local libreoffice binary when available, otherwise the dockerised
    converter with *job_dir* bind-mounted at /work.  While the process runs,
    progress (70–84%) is reported by counting the PDFs that have appeared on
    disk, nudged forward by elapsed time so the bar never stalls.

    Returns the expected PDF paths.  Raises RuntimeError when the converter
    exits non-zero or any output file is missing.
    """
    if not pptx_paths:
        return []

    pdf_paths = [p.with_suffix(".pdf") for p in pptx_paths]
    total = len(pdf_paths)
    start_ts = time.time()
    last_percent = -1
    last_done = -1

    if use_local_office_converter():
        args = [
            str(LOCAL_LIBREOFFICE_BIN),
            "--headless",
            "--convert-to",
            "pdf",
            "--outdir",
            str(job_dir),
        ]
        args.extend([str(p) for p in pptx_paths])
    else:
        ensure_converter_image()
        # Run the container as the current user so outputs are owned by us.
        uid_gid = f"{os.getuid()}:{os.getgid()}"
        workdir = str(job_dir)
        args = [
            "docker",
            "run",
            "--rm",
            "--user",
            uid_gid,
            "-v",
            f"{workdir}:/work",
            CONVERTER_IMAGE,
            "libreoffice",
            "--headless",
            "--convert-to",
            "pdf",
            "--outdir",
            "/work",
        ]
        args.extend([f"/work/{p.name}" for p in pptx_paths])

    proc = subprocess.Popen(
        args,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
    )
    # Poll the process while watching the filesystem for finished PDFs.
    while True:
        ret = proc.poll()

        done = 0
        for p in pdf_paths:
            if p.exists():
                done += 1

        # Roll between 70 and 84 so the bar does not sit at 70% for long.
        percent_by_done = int((done / max(total, 1)) * 14)
        percent_by_time = min(13, int((time.time() - start_ts) / 2.0))
        percent = min(84, 70 + max(percent_by_done, percent_by_time))
        if progress_cb and (done != last_done or percent != last_percent):
            progress_cb(percent, "PPT转PDF", f"已转 {done}/{total}")
            last_done = done
            last_percent = percent

        if ret is not None:
            break
        time.sleep(0.35)

    stdout, stderr = proc.communicate()
    if proc.returncode != 0:
        raise RuntimeError(f"pptx->pdf batch convert failed: {stderr.strip() or stdout.strip()}")

    if progress_cb:
        progress_cb(84, "PPT转PDF", f"已转 {total}/{total}")

    # LibreOffice can exit 0 yet skip files; verify every expected output.
    missing = [str(p.name) for p in pdf_paths if not p.exists()]
    if missing:
        raise RuntimeError(f"pptx->pdf batch convert failed, missing outputs: {', '.join(missing[:5])}")

    return pdf_paths
|
||
|
||
|
||
def extract_pdf_pages_to_png_batch(
    tasks: list[dict[str, Any]],
    config: dict[str, Any] | None = None,
    progress_cb: Callable[[int, str, str], None] | None = None,
) -> None:
    """Extract one PNG per task from its PDF, in parallel via pdftoppm + xargs.

    Each task dict must carry "pdf_path", "png_path" and a 1-based
    "page_number".  A NUL-delimited map file (pdf, page, output-prefix triples)
    is written into the job directory and fed to ``xargs -0 -n 3`` so a pool of
    pdftoppm processes runs either locally or inside the converter container.
    Progress (85–94%) is reported by counting PNGs that appear on disk.

    Raises RuntimeError when the shell pipeline fails or any PNG is missing.
    """
    if not tasks:
        return

    # All tasks are assumed to live in the same job directory as the first PDF.
    first_pdf = Path(str(tasks[0]["pdf_path"]))
    job_dir = first_pdf.parent

    local_pdftoppm = use_local_pdftoppm()
    map_file = job_dir / "_extract_map.nul"
    with map_file.open("wb") as f:
        for t in tasks:
            if local_pdftoppm:
                # Local mode uses absolute host paths.
                pdf_field = str(Path(str(t["pdf_path"])).resolve())
                out_field = str(Path(str(t["png_path"])).with_suffix("").resolve())
            else:
                # Docker mode uses names relative to the /work bind mount.
                pdf_field = Path(str(t["pdf_path"])).name
                out_field = Path(str(t["png_path"])).with_suffix("").name
            pdf_name = pdf_field.encode("utf-8")
            page_number = str(int(t["page_number"])).encode("utf-8")
            out_prefix = out_field.encode("utf-8")
            # NUL-separated triple: pdf path, page number, output prefix.
            f.write(pdf_name + b"\0" + page_number + b"\0" + out_prefix + b"\0")

    parallel_jobs = resolve_extract_workers(config or {})
    # NOTE(review): map_file and LOCAL_PDFTOPPM_BIN paths are interpolated into
    # a shell command line; they are server-controlled here, but quoting would
    # break on paths containing shell metacharacters — confirm job_dir naming.
    if local_pdftoppm:
        script = (
            "set -e; "
            f"xargs -0 -P {parallel_jobs} -n 3 sh -c "
            "'pdf=\"$1\"; page=\"$2\"; out=\"$3\"; "
            f"\"{LOCAL_PDFTOPPM_BIN}\" -f \"$page\" -singlefile -png \"$pdf\" \"$out\"' _ "
            f"< {str(map_file)}"
        )
        args = ["sh", "-lc", script]
    else:
        ensure_converter_image()
        uid_gid = f"{os.getuid()}:{os.getgid()}"
        script = (
            "set -e; "
            f"xargs -0 -P {parallel_jobs} -n 3 sh -c "
            "'pdf=\"$1\"; page=\"$2\"; out=\"$3\"; "
            "pdftoppm -f \"$page\" -singlefile -png \"/work/$pdf\" \"/work/$out\"' _ "
            "< /work/_extract_map.nul"
        )
        args = [
            "docker",
            "run",
            "--rm",
            "--user",
            uid_gid,
            "-v",
            f"{str(job_dir)}:/work",
            CONVERTER_IMAGE,
            "sh",
            "-lc",
            script,
        ]
    proc = subprocess.Popen(
        args,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
    )
    total = len(tasks)
    last_done = -1
    last_percent = -1
    # Poll the process while watching the filesystem for finished PNGs.
    while True:
        ret = proc.poll()
        done = 0
        for t in tasks:
            if Path(str(t["png_path"])).exists():
                done += 1
        percent = min(94, 85 + int((done / max(total, 1)) * 9))
        if progress_cb and (done != last_done or percent != last_percent):
            progress_cb(percent, "PDF转PNG", f"已导出 {done}/{total}")
            last_done = done
            last_percent = percent
        if ret is not None:
            break
        time.sleep(0.35)

    stdout, stderr = proc.communicate()
    map_file.unlink(missing_ok=True)
    if proc.returncode != 0:
        raise RuntimeError(f"pdf->png batch extract failed: {stderr.strip() or stdout.strip()}")

    # Verify every expected PNG materialized.
    missing_png = [str(Path(str(t["png_path"])).name) for t in tasks if not Path(str(t["png_path"])).exists()]
    if missing_png:
        raise RuntimeError(f"pdf->png batch extract failed, missing outputs: {', '.join(missing_png[:5])}")
|
||
|
||
|
||
def cleanup_download_cache() -> None:
    """Drop download-cache entries older than one hour.

    Callers (e.g. register_download) are expected to hold _DOWNLOAD_LOCK.
    """
    cutoff = time.time() - 3600  # entries expire after one hour
    expired = [
        token
        for token, meta in DOWNLOAD_CACHE.items()
        if float(meta.get("created_at", 0)) < cutoff
    ]
    for token in expired:
        DOWNLOAD_CACHE.pop(token, None)
|
||
|
||
|
||
def register_download(file_path: Path, content_type: str = "application/octet-stream") -> str:
    """Register *file_path* in the download cache and return its access token."""
    token = uuid.uuid4().hex
    entry = {
        "file_path": str(file_path),
        "filename": file_path.name,
        "content_type": content_type,
        "created_at": time.time(),
    }
    with _DOWNLOAD_LOCK:
        # Piggy-back expiry of stale entries on every registration.
        cleanup_download_cache()
        DOWNLOAD_CACHE[token] = entry
    return token
|
||
|
||
|
||
def generate_records(
    records: list[dict[str, Any]],
    config: dict[str, Any],
    template_path: Path,
    output_dir: Path,
    progress_cb: Callable[[int, str, str], None] | None = None,
) -> dict[str, Any]:
    """Run the full PPT→PDF→PNG pipeline for *records* inside a fresh job directory.

    Stages (with progress percentages reported through *progress_cb* as
    ``(percent, stage, detail)``): optional template cache build (14–18),
    per-record PPT build (20–65, possibly threaded), batch PPTX→PDF (70–84),
    batch PDF→PNG (85–94), download registration (95–100).

    Returns a summary dict with the job id/dir, strategy, generated items and
    their download descriptors.  The job directory is always unregistered and
    runtime memory reclaimed, even on failure.
    """
    output_cfg = config.get("output_settings", {})
    auto_cleanup = bool(output_cfg.get("auto_cleanup_pptx", True)) if isinstance(output_cfg, dict) else True

    # Unique per-job working directory under output_dir.
    job_id = f"job_{now_ts()}_{uuid.uuid4().hex[:6]}"
    job_dir = output_dir / job_id
    job_dir.mkdir(parents=True, exist_ok=True)
    register_active_job_dir(job_dir)

    try:
        generated_items: list[dict[str, Any]] = []
        download_images: list[dict[str, Any]] = []
        tasks: list[dict[str, Any]] = []
        page_template_cache: dict[str, bytes] | None = None

        used_names: set[str] = set()
        total_records = len(records)

        # Pre-trim one single-slide template per distinct page when the batch
        # is large enough to amortize the cache build.
        generation_strategy = resolve_generation_strategy(config, total_records)
        if generation_strategy == "page_template_cache" and total_records > 0:
            page_names = {str(r.get("page", "")).strip() for r in records if str(r.get("page", "")).strip()}
            if page_names:
                if progress_cb:
                    progress_cb(14, "准备模板", "构建单页模板缓存")
                page_template_cache = build_page_template_cache(template_path, page_names)
                if progress_cb:
                    progress_cb(18, "准备模板", f"已缓存 {len(page_template_cache)} 个页面")

        if progress_cb:
            progress_cb(20, "生成PPT", f"0/{total_records}")

        # Plan one task (ppt/pdf/png paths) per record, de-duplicating names
        # with a _1, _2, ... suffix.
        for i, rec in enumerate(records, start=1):
            base_name = safe_filename(str(rec.get("output_file") or f"喜报_{rec.get('branch', '未知网点')}_{i}.png"))
            if not base_name.lower().endswith(".png"):
                base_name += ".png"

            final_name = base_name
            seq = 1
            while final_name in used_names:
                final_name = f"{Path(base_name).stem}_{seq}.png"
                seq += 1
            used_names.add(final_name)

            ppt_name = f"{Path(final_name).stem}.pptx"
            ppt_path = job_dir / ppt_name
            pdf_path = job_dir / f"{Path(final_name).stem}.pdf"
            png_path = job_dir / final_name

            tasks.append(
                {
                    "record": rec,
                    "ppt_path": ppt_path,
                    "pdf_path": pdf_path,
                    "png_path": png_path,
                    "page_number": 1,
                    "png_name": final_name,
                }
            )

        def _build_one(task: dict[str, Any]) -> None:
            # Build the PPT for a single task, recording which page number the
            # downstream extraction must pull; failures are logged then re-raised.
            rec = dict(task.get("record", {}))
            ppt_path = Path(str(task.get("ppt_path")))
            try:
                export_page_number = build_ppt_for_record(
                    template_path,
                    config,
                    rec,
                    ppt_path,
                    page_template_cache=page_template_cache,
                )
                task["page_number"] = int(export_page_number)
            except Exception as exc:
                append_review_log(
                    "generation_error",
                    {
                        "reason": "build_ppt_failed",
                        "source_line": str(rec.get("source_line") or rec.get("raw_text") or ""),
                        "record": rec,
                        "error": str(exc),
                    },
                )
                raise

        # Build PPTs sequentially or on a thread pool, per configured workers.
        max_workers = resolve_build_workers(config, total_records)
        if total_records <= 1 or max_workers <= 1:
            done = 0
            for task in tasks:
                _build_one(task)
                done += 1
                if progress_cb:
                    pct = 20 + int((done / max(total_records, 1)) * 45)
                    progress_cb(pct, "生成PPT", f"{done}/{total_records}")
        else:
            done = 0
            with ThreadPoolExecutor(max_workers=max_workers) as ex:
                futs = [ex.submit(_build_one, t) for t in tasks]
                for fut in as_completed(futs):
                    # Propagate any build failure immediately.
                    fut.result()
                    done += 1
                    if progress_cb:
                        pct = 20 + int((done / max(total_records, 1)) * 45)
                        progress_cb(pct, "生成PPT", f"{done}/{total_records}")

        if progress_cb:
            progress_cb(70, "PPT转PDF", f"已转 0/{total_records}")
        convert_pptx_to_pdf_batch(
            job_dir,
            [Path(str(t["ppt_path"])) for t in tasks],
            progress_cb=progress_cb,
        )
        if progress_cb:
            progress_cb(85, "PDF转PNG", "已导出 0/{}".format(total_records))
        extract_pdf_pages_to_png_batch(tasks, config=config, progress_cb=progress_cb)

        # Register every PNG for download and optionally delete intermediates.
        for i, t in enumerate(tasks, start=1):
            rec = dict(t["record"])
            ppt_path = Path(str(t["ppt_path"]))
            pdf_path = Path(str(t["pdf_path"]))
            png_path = Path(str(t["png_path"]))

            if auto_cleanup:
                ppt_path.unlink(missing_ok=True)
                pdf_path.unlink(missing_ok=True)

            item = dict(rec)
            item["png_file"] = str(t["png_name"])
            item["job_id"] = job_id
            item["image_path"] = str(png_path)
            token = register_download(png_path, content_type="image/png")
            item["download_token"] = token
            item["download_url"] = f"/api/download/{token}"
            generated_items.append(item)
            download_images.append(
                {
                    "name": str(t["png_name"]),
                    "download_token": token,
                    "download_url": f"/api/download/{token}",
                }
            )
            if progress_cb:
                pct = 95 + int((i / max(total_records, 1)) * 5)
                progress_cb(pct, "整理下载", f"{i}/{total_records}")

        return {
            "job_id": job_id,
            "job_dir": str(job_dir),
            "generation_strategy": generation_strategy,
            "generated": generated_items,
            "generated_count": len(generated_items),
            "download_images": download_images,
        }
    finally:
        # Always release the job-dir registration and reclaim memory.
        unregister_active_job_dir(job_dir)
        reclaim_runtime_memory(config)
|
||
|
||
|
||
class XibaoHandler(SimpleHTTPRequestHandler):
    """HTTP handler serving the static UI plus the JSON API routes.

    GET/POST requests are first offered to the route tables in
    ``api_get_routes`` / ``api_post_routes``; unhandled GETs fall back to
    static-file serving from ``STATIC_DIR``, unhandled POSTs get a JSON 404.
    """

    def __init__(self, *args: Any, **kwargs: Any):
        # Serve static assets (index.html, JS, CSS) from STATIC_DIR.
        super().__init__(*args, directory=str(STATIC_DIR), **kwargs)

    def _send_json(self, payload: dict[str, Any], status: HTTPStatus = HTTPStatus.OK) -> None:
        """Serialize *payload* as UTF-8 JSON and send it with *status*."""
        data = json.dumps(payload, ensure_ascii=False).encode("utf-8")
        self.send_response(status)
        self.send_header("Content-Type", "application/json; charset=utf-8")
        self.send_header("Content-Length", str(len(data)))
        self.end_headers()
        self.wfile.write(data)

    def _send_bytes(
        self,
        content: bytes,
        content_type: str,
        filename: str | None = None,
        status: HTTPStatus = HTTPStatus.OK,
    ) -> None:
        """Send an in-memory payload, optionally as a named attachment."""
        self.send_response(status)
        self.send_header("Content-Type", content_type)
        self.send_header("Content-Length", str(len(content)))
        if filename:
            # RFC 5987 (filename*=UTF-8'') so non-ASCII filenames survive the header.
            encoded = quote(filename)
            self.send_header("Content-Disposition", f"attachment; filename*=UTF-8''{encoded}")
        self.end_headers()
        self.wfile.write(content)

    def _send_file(
        self,
        file_path: Path,
        content_type: str,
        filename: str | None = None,
        status: HTTPStatus = HTTPStatus.OK,
        max_kbps: int = 0,
        chunk_size: int = 16 * 1024,
    ) -> None:
        """Stream *file_path* to the client in chunks.

        When *max_kbps* > 0 the average send rate is throttled to that
        many KiB/s; 0 disables throttling.
        """
        total_size = int(file_path.stat().st_size)
        self.send_response(status)
        self.send_header("Content-Type", content_type)
        self.send_header("Content-Length", str(total_size))
        if filename:
            encoded = quote(filename)
            self.send_header("Content-Disposition", f"attachment; filename*=UTF-8''{encoded}")
        self.end_headers()

        max_bytes_per_sec = max(0, int(max_kbps)) * 1024
        block_size = max(4096, int(chunk_size))
        sent = 0
        start_ts = time.perf_counter()

        with file_path.open("rb") as f:
            while True:
                chunk = f.read(block_size)
                if not chunk:
                    break
                self.wfile.write(chunk)
                sent += len(chunk)

                if max_bytes_per_sec > 0:
                    # Sleep just enough to keep the cumulative rate at the
                    # cap; each pause is bounded at 0.2s so the connection
                    # stays responsive to shutdown/teardown.
                    elapsed = time.perf_counter() - start_ts
                    expected_elapsed = sent / max_bytes_per_sec
                    if expected_elapsed > elapsed:
                        time.sleep(min(expected_elapsed - elapsed, 0.2))

    def _read_json_body(self) -> dict[str, Any]:
        """Read and parse the request body as JSON.

        Returns an empty dict for a missing or empty body.

        Raises:
            ValueError: if the body is not valid JSON (chained to the
                underlying ``json.JSONDecodeError``).
        """
        try:
            content_len = int(self.headers.get("Content-Length", "0"))
        except ValueError:
            content_len = 0

        body = self.rfile.read(content_len) if content_len > 0 else b"{}"
        if not body:
            return {}

        try:
            return json.loads(body.decode("utf-8"))
        except json.JSONDecodeError as exc:
            # Chain the decode error (PEP 3134) so the real parse failure
            # is preserved in tracebacks instead of being swallowed.
            raise ValueError("invalid JSON body") from exc

    def do_GET(self) -> None:  # noqa: N802
        """Dispatch API GET routes first, then fall back to static files."""
        parsed = urlparse(self.path)

        if handle_get_routes(self, parsed, globals()):
            return

        if parsed.path == "/":
            self.path = "/index.html"

        super().do_GET()

    def do_POST(self) -> None:  # noqa: N802
        """Dispatch API POST routes; unknown paths get a JSON 404."""
        parsed = urlparse(self.path)
        if handle_post_routes(self, parsed, globals()):
            return

        self._send_json({"ok": False, "error": "not found"}, HTTPStatus.NOT_FOUND)
def run_server(
    host: str = "0.0.0.0",
    port: int = 8787,
    prewarm: bool = True,
    prewarm_blocking: bool = False,
) -> None:
    """Launch the Xibao HTTP server and block until interrupted.

    Optionally prewarms the converter image first, and keeps a daemon
    thread running the daily cleanup loop for the server's lifetime.
    """
    if prewarm:
        prewarm_converter_image(background=not prewarm_blocking)

    stop_flag = threading.Event()
    janitor = threading.Thread(
        target=daily_cleanup_loop,
        args=(stop_flag,),
        name="daily-cleanup",
        daemon=True,
    )
    janitor.start()

    httpd = ThreadingHTTPServer((host, port), XibaoHandler)
    print(f"Xibao Web running on http://{host}:{port}")
    try:
        httpd.serve_forever()
    except KeyboardInterrupt:
        # Ctrl-C is the expected shutdown path; fall through to cleanup.
        pass
    finally:
        stop_flag.set()
        janitor.join(timeout=1.0)
        httpd.server_close()
if __name__ == "__main__":
    # CLI entry point: parse bind/prewarm options and start serving.
    cli = argparse.ArgumentParser(description="Xibao Web server")
    cli.add_argument("--host", default="0.0.0.0", help="bind host, e.g. 0.0.0.0 or 127.0.0.1")
    cli.add_argument("--port", type=int, default=8787, help="bind port")
    cli.add_argument(
        "--skip-prewarm",
        action="store_true",
        help="disable docker converter image prewarm on startup",
    )
    cli.add_argument(
        "--prewarm-blocking",
        action="store_true",
        help="prewarm converter image before serving (blocks startup until ready)",
    )
    opts = cli.parse_args()
    run_server(
        host=opts.host,
        port=opts.port,
        prewarm=not opts.skip_prewarm,
        prewarm_blocking=opts.prewarm_blocking,
    )