diff --git a/rcjav/__init__.py b/rcjav/__init__.py index bd7d98c..7833f0c 100644 --- a/rcjav/__init__.py +++ b/rcjav/__init__.py @@ -82,11 +82,15 @@ from rcjav.dupes import ( # noqa: F401 from rcjav.cache import ( # noqa: F401 CACHE_PATH, CACHE_VERSION, + CACHE_SCHEMA_VERSION, + ID_RULES_VERSION, CACHE_STALE_HOURS, load_cache, save_cache, cache_age_hours, fmt_age, + cache_state, + stamp_current_rules, ) from rcjav.ids import ( # noqa: F401 PRIMARY_ID_RE, @@ -103,4 +107,5 @@ from rcjav.ids import ( # noqa: F401 normalize_id, describe_id_match, expand_range, + current_rules_signature, ) diff --git a/rcjav/cache.py b/rcjav/cache.py index da0c87b..3b959a7 100644 --- a/rcjav/cache.py +++ b/rcjav/cache.py @@ -2,10 +2,20 @@ This module owns the on-disk cache contract: where the file lives, what the header looks like, and how mismatches are handled. The -current shape predates the two-tier `cache_schema` + `id_rules` split -documented in docs/CACHE_CONTRACT.md (extension repo) — step 10j -implements that contract; until then this is the legacy -`version: 3` reader. +contract is the two-tier `cache_schema` + `id_rules` model from +docs/CACHE_CONTRACT.md (extension repo). + + cache_schema on-disk shape. Mismatch -> force rebuild. + id_rules integer; bumps when extraction rules change. + Mismatch -> mark stale, allow lazy re-extract. + id_rules_signature sha256 over canonical rule text (see + rcjav.ids.current_rules_signature). Belt-and- + braces drift check that catches a forgotten + `id_rules` bump. + +Legacy users on `version: 3` get an in-place header upgrade with no +forced rescan; the cache is marked as `id_rules: 0` so it shows up +as "stale by rules" until they Re-extract IDs. """ from __future__ import annotations @@ -19,29 +29,107 @@ from pathlib import Path # Lives next to rc-jav.py at the repo root. 
CACHE_PATH = Path(__file__).resolve().parents[1] / "cache.json" -CACHE_VERSION = 3 # bumped: extract_id handles bracket-wrapped IDs + no-hyphen fallback CACHE_STALE_HOURS = 24 +# Two-tier version contract (see docs/CACHE_CONTRACT.md): +CACHE_SCHEMA_VERSION = 1 # on-disk shape; bump = force rebuild +ID_RULES_VERSION = 1 # extraction rules; bump = mark stale (lazy re-extract) + +# Legacy alias preserved for any external caller that still imports it. +# Maps to CACHE_SCHEMA_VERSION + ID_RULES_VERSION under the new contract. +CACHE_VERSION = 3 + + +def _fresh_cache(signature: str = "unknown") -> dict: + return { + "cache_schema": CACHE_SCHEMA_VERSION, + "id_rules": ID_RULES_VERSION, + "id_rules_signature": signature, + "remotes": {}, + } + + +def _migrate_legacy_v3(data: dict) -> dict: + """Translate a legacy `version: 3` cache to the new header in place. + + Sets `id_rules: 0` so the cache reads as "stale by rules" — user + sees the new amber state and can opt into a fast Re-extract without + a rclone re-scan. + """ + return { + "cache_schema": CACHE_SCHEMA_VERSION, + "id_rules": 0, + "id_rules_signature": "legacy", + "remotes": data.get("remotes", {}), + } + + +def load_cache(current_signature: str | None = None) -> dict: + """Read and (if necessary) migrate cache.json. + + `current_signature` is the value of `rcjav.ids.current_rules_signature()` + captured by the caller. It's only stamped into the header when this + function has to mint a *fresh* cache; when migrating legacy data we + deliberately stamp `"legacy"` so the cache reads as stale-by-rules. 
+ """ + fresh_sig = current_signature or "unknown" -def load_cache() -> dict: if not CACHE_PATH.exists(): - return {"version": CACHE_VERSION, "remotes": {}} + return _fresh_cache(fresh_sig) + try: data = json.loads(CACHE_PATH.read_text(encoding="utf-8")) - if ( - not isinstance(data, dict) - or data.get("version") != CACHE_VERSION - or not isinstance(data.get("remotes"), dict) - ): - if isinstance(data, dict) and "version" in data and data["version"] != CACHE_VERSION: - sys.stderr.write( - f"[warn] cache version mismatch (got {data['version']}, " - f"expected {CACHE_VERSION}); forcing full rescan.\n" - ) - return {"version": CACHE_VERSION, "remotes": {}} - return data except (json.JSONDecodeError, OSError): - return {"version": CACHE_VERSION, "remotes": {}} + return _fresh_cache(fresh_sig) + + if not isinstance(data, dict) or not isinstance(data.get("remotes"), dict): + return _fresh_cache(fresh_sig) + + # Legacy header: { "version": 3, "remotes": {...} } — migrate in place. + if "version" in data and "cache_schema" not in data: + if data.get("version") == 3: + return _migrate_legacy_v3(data) + sys.stderr.write( + f"[warn] unknown legacy cache version {data.get('version')!r}; " + f"rebuilding.\n" + ) + return _fresh_cache(fresh_sig) + + # New header: validate schema. Mismatch = force rebuild (per contract). + if data.get("cache_schema") != CACHE_SCHEMA_VERSION: + sys.stderr.write( + f"[warn] cache_schema mismatch (got {data.get('cache_schema')!r}, " + f"expected {CACHE_SCHEMA_VERSION}); forcing full rescan.\n" + ) + return _fresh_cache(fresh_sig) + + return data + + +def cache_state(cache: dict, current_signature: str) -> str: + """Classify a cache dict against the live rule set. + + Returns one of: "fresh", "stale_by_rules", "schema_mismatch". + + "schema_mismatch" should normally never reach the caller — load_cache + already rebuilds. It's reported for diagnostics flows that read + cache.json directly without going through load_cache. 
+ """ + if cache.get("cache_schema") != CACHE_SCHEMA_VERSION: + return "schema_mismatch" + rules_match = cache.get("id_rules") == ID_RULES_VERSION + sig_match = cache.get("id_rules_signature") == current_signature + return "fresh" if (rules_match and sig_match) else "stale_by_rules" + + +def stamp_current_rules(cache: dict, current_signature: str) -> None: + """Stamp `id_rules` and `id_rules_signature` to current values in place. + + Use after a successful re-extract or full scan completes against the + live rule set. + """ + cache["id_rules"] = ID_RULES_VERSION + cache["id_rules_signature"] = current_signature def save_cache(cache: dict) -> None: diff --git a/rcjav/cli.py b/rcjav/cli.py index d9a30ed..bb27ba1 100644 --- a/rcjav/cli.py +++ b/rcjav/cli.py @@ -404,6 +404,11 @@ def main(): help="Relative path of the file to rename (within --remote).") ap.add_argument("--new-path", metavar="PATH", help="New relative path after rename (within --remote).") + ap.add_argument("--reextract", action="store_true", + help="Walk cache.json and recompute jav_id on every file entry " + "using the current ID extraction rules. No rclone calls — " + "fast path for picking up rule changes without re-scanning. " + "Outputs JSON when --format json, plain otherwise.") ap.add_argument("--basic", action="store_true", help="Plain text output, no rich tables/panels/progress bars. " "Useful for piping or low-bandwidth terminals.") @@ -480,6 +485,79 @@ def main(): args.catalog = list(DEFAULT_CATALOG) # --library-issues: read-only cache scan for non-canonical filenames. + # --reextract: rebuild jav_id values from current rules without re-scanning. 
+ if args.reextract: + from rcjav.ids import current_rules_signature + from rcjav.cache import stamp_current_rules + sig = current_rules_signature() + cache = load_cache(sig) + changed = 0 + unchanged = 0 + dropped = 0 + per_remote = [] + for remote, entry in (cache.get("remotes") or {}).items(): + r_changed = 0 + r_unchanged = 0 + r_dropped = 0 + files = entry.get("files") or [] + for f in files: + old_id = f.get("jav_id") or "" + new_id = extract_id(Path(f.get("path", "")).name) + if new_id is None: + if old_id: + f["jav_id"] = "" + r_dropped += 1 + continue + if new_id != old_id: + f["jav_id"] = new_id + r_changed += 1 + else: + r_unchanged += 1 + changed += r_changed + unchanged += r_unchanged + dropped += r_dropped + per_remote.append({ + "remote": remote, + "changed": r_changed, + "unchanged": r_unchanged, + "dropped": r_dropped, + "files": len(files), + }) + stamp_current_rules(cache, sig) + save_cache(cache) + summary = { + "ok": True, + "changed": changed, + "unchanged": unchanged, + "dropped": dropped, + "total": changed + unchanged + dropped, + "id_rules_signature": sig, + "remotes": per_remote, + } + if args.format == "json" or BASIC: + print(json.dumps(summary)) + else: + console.print(Panel( + f"[bold]Re-extracted IDs against current rules[/]\n" + f" changed: [yellow]{changed:,}[/]\n" + f" unchanged: [dim]{unchanged:,}[/]\n" + f" dropped: [red]{dropped:,}[/]\n" + f" total: {summary['total']:,}", + title="Re-extract", border_style="green")) + if per_remote: + from rich.table import Table as _Tbl + t = _Tbl(title="Per-remote", show_lines=False) + t.add_column("Remote", style="cyan") + t.add_column("Changed", justify="right", style="yellow") + t.add_column("Unchanged", justify="right", style="dim") + t.add_column("Dropped", justify="right", style="red") + t.add_column("Files", justify="right") + for r in per_remote: + t.add_row(r["remote"], f"{r['changed']:,}", f"{r['unchanged']:,}", + f"{r['dropped']:,}", f"{r['files']:,}") + console.print(t) + 
def current_rules_signature() -> str:
    """Return a `sha256:`-prefixed digest of the ID-extraction rule text.

    The digest covers the `.pattern` source of each built-in ID regex, the
    patterns in BUILTIN_PART_RES and PART_RES (in list order, so reordering
    either list changes the digest), a fixed `fc2_handling` marker, and a
    `schema` number for the signature layout itself.

    The payload is serialized with `sort_keys=True`, so the result does not
    depend on the order in which the keys are assembled here.
    """
    import hashlib
    import json as _json

    single_patterns = {
        "primary": PRIMARY_ID_RE,
        "compound": COMPOUND_ID_RE,
        "fallback": FALLBACK_ID_RE,
        "nohyphen": _NOHYPHEN_ID_RE,
        "bracket": _BRACKET_ID_RE,
        "variant": _VARIANT_SUFFIX_RE,
        "xofy": _XOFY_PRIORITY_RE,
        "resolution_tag": _RESOLUTION_TAG_RE,
    }
    payload = {key: rx.pattern for key, rx in single_patterns.items()}
    payload["builtin_part_res"] = [rx.pattern for rx in BUILTIN_PART_RES]
    payload["part_res"] = [rx.pattern for rx in PART_RES]
    payload["fc2_handling"] = "fc2_to_ppv"
    payload["schema"] = 1  # bump when this signature schema itself changes

    canonical = _json.dumps(payload, sort_keys=True, ensure_ascii=False)
    digest = hashlib.sha256(canonical.encode("utf-8")).hexdigest()
    return f"sha256:{digest}"