"""cache.json I/O. This module owns the on-disk cache contract: where the file lives, what the header looks like, and how mismatches are handled. The contract is the two-tier `cache_schema` + `id_rules` model from docs/CACHE_CONTRACT.md (extension repo). cache_schema on-disk shape. Mismatch -> force rebuild. id_rules integer; bumps when extraction rules change. Mismatch -> mark stale, allow lazy re-extract. id_rules_signature sha256 over canonical rule text (see rcjav.ids.current_rules_signature). Belt-and- braces drift check that catches a forgotten `id_rules` bump. Legacy users on `version: 3` get an in-place header upgrade with no forced rescan; the cache is marked as `id_rules: 0` so it shows up as "stale by rules" until they Re-extract IDs. """ from __future__ import annotations import json import os import sys import time from datetime import datetime from pathlib import Path # Lives next to rc-jav.py at the repo root. CACHE_PATH = Path(__file__).resolve().parents[1] / "cache.json" CACHE_STALE_HOURS = 24 # Two-tier version contract (see docs/CACHE_CONTRACT.md): CACHE_SCHEMA_VERSION = 1 # on-disk shape; bump = force rebuild ID_RULES_VERSION = 1 # extraction rules; bump = mark stale (lazy re-extract) # Legacy alias preserved for any external caller that still imports it. # Maps to CACHE_SCHEMA_VERSION + ID_RULES_VERSION under the new contract. CACHE_VERSION = 3 def _fresh_cache(signature: str = "unknown") -> dict: return { "cache_schema": CACHE_SCHEMA_VERSION, "id_rules": ID_RULES_VERSION, "id_rules_signature": signature, "remotes": {}, } def _migrate_legacy_v3(data: dict) -> dict: """Translate a legacy `version: 3` cache to the new header in place. Sets `id_rules: 0` so the cache reads as "stale by rules" — user sees the new amber state and can opt into a fast Re-extract without a rclone re-scan. """ return { "cache_schema": CACHE_SCHEMA_VERSION, "id_rules": 0, "id_rules_signature": "legacy", "remotes": data.get("remotes", {}), } def load_cache(current_signature: str | None = None) -> dict: """Read and (if necessary) migrate cache.json. `current_signature` is the value of `rcjav.ids.current_rules_signature()` captured by the caller. It's only stamped into the header when this function has to mint a *fresh* cache; when migrating legacy data we deliberately stamp `"legacy"` so the cache reads as stale-by-rules. """ fresh_sig = current_signature or "unknown" if not CACHE_PATH.exists(): return _fresh_cache(fresh_sig) try: data = json.loads(CACHE_PATH.read_text(encoding="utf-8")) except (json.JSONDecodeError, OSError): return _fresh_cache(fresh_sig) if not isinstance(data, dict) or not isinstance(data.get("remotes"), dict): return _fresh_cache(fresh_sig) # Legacy header: { "version": 3, "remotes": {...} } — migrate in place. if "version" in data and "cache_schema" not in data: if data.get("version") == 3: return _migrate_legacy_v3(data) sys.stderr.write( f"[warn] unknown legacy cache version {data.get('version')!r}; " f"rebuilding.\n" ) return _fresh_cache(fresh_sig) # New header: validate schema. Mismatch = force rebuild (per contract). if data.get("cache_schema") != CACHE_SCHEMA_VERSION: sys.stderr.write( f"[warn] cache_schema mismatch (got {data.get('cache_schema')!r}, " f"expected {CACHE_SCHEMA_VERSION}); forcing full rescan.\n" ) return _fresh_cache(fresh_sig) return data def cache_state(cache: dict, current_signature: str) -> str: """Classify a cache dict against the live rule set. Returns one of: "fresh", "stale_by_rules", "schema_mismatch". "schema_mismatch" should normally never reach the caller — load_cache already rebuilds. It's reported for diagnostics flows that read cache.json directly without going through load_cache. """ if cache.get("cache_schema") != CACHE_SCHEMA_VERSION: return "schema_mismatch" rules_match = cache.get("id_rules") == ID_RULES_VERSION sig_match = cache.get("id_rules_signature") == current_signature return "fresh" if (rules_match and sig_match) else "stale_by_rules" def stamp_current_rules(cache: dict, current_signature: str) -> None: """Stamp `id_rules` and `id_rules_signature` to current values in place. Use after a successful re-extract or full scan completes against the live rule set. """ cache["id_rules"] = ID_RULES_VERSION cache["id_rules_signature"] = current_signature def save_cache(cache: dict) -> None: # Write to a sibling tmp file then atomically replace, so a killed mid-write # (Ctrl-C, power loss, concurrent --scan) can't leave a half-written # cache.json — load_cache would otherwise see invalid JSON and fall back to # an empty cache, forcing a full re-scan. tmp = CACHE_PATH.with_suffix(CACHE_PATH.suffix + ".tmp") tmp.write_text(json.dumps(cache, indent=2), encoding="utf-8") try: os.replace(tmp, CACHE_PATH) except PermissionError: # Windows: destination may be briefly locked by antivirus or a concurrent reader. time.sleep(0.5) os.replace(tmp, CACHE_PATH) def cache_age_hours(scanned_at: str) -> float | None: try: dt = datetime.fromisoformat(scanned_at.replace("Z", "+00:00")) except ValueError: return None now = datetime.now(dt.tzinfo) if dt.tzinfo else datetime.now() return (now - dt).total_seconds() / 3600.0 def fmt_age(hours: float) -> str: if hours < 1: return f"{int(hours * 60)}m" if hours < 24: return f"{hours:.1f}h" return f"{hours / 24:.1f}d"