33c495ad57
Implements the two-tier contract from docs/CACHE_CONTRACT.md (extension
repo, locked at step 9):
cache_schema on-disk shape; mismatch -> force rebuild
id_rules bumps when extraction rules change
id_rules_signature sha256 over canonical rule text; catches drift
when the integer bump is forgotten
New constants in rcjav/cache.py:
CACHE_SCHEMA_VERSION = 1
ID_RULES_VERSION = 1 (the legacy "version: 3" cache reads as
id_rules: 0 after in-place migration)
New helpers:
rcjav.ids.current_rules_signature()
Sha256 over the canonical text of every rule that influences
a jav_id: built-in regexes, BUILTIN_PART_RES, PART_RES (which
captures user-added part patterns), FC2 handling.
rcjav.cache.load_cache(current_signature=None)
Reads cache.json. Legacy `version: 3` headers get an in-place
header upgrade with no forced rescan; the cache is stamped as
`id_rules: 0` + signature "legacy" so it surfaces as
"stale by rules" in cache_state. Schema mismatch on the new
header still forces a rebuild.
rcjav.cache.cache_state(cache, signature)
Classifies a cache as "fresh" / "stale_by_rules" /
"schema_mismatch". Drives the three-state extension UX.
rcjav.cache.stamp_current_rules(cache, signature)
Updates id_rules and id_rules_signature in place. Called after
a successful full scan or --reextract.
New CLI command:
rc-jav.py --reextract
Walks `cache["remotes"][r]["files"]` against the live rule set and
updates `jav_id` in place. No rclone calls — fast path (seconds on
a 7k-file cache). Reports changed/unchanged/dropped per remote.
Stamps current rules into the saved cache.
--scan (full, no --scan-since) now also stamps current rules.
--scan --scan-since deliberately does NOT stamp: it only re-walks
recently-modified files, so older entries may still carry jav_ids
from previous rules; cache stays "stale by rules" until a full scan
or --reextract.
Verified:
- python rc-jav.py --reextract --format json on the live 7124-file
cache → 0 changes (existing IDs already canonical), cache.json
rewritten with new header
- cache_state on the post-migration cache → "fresh"
- tests + fixtures + --help all pass
Extension-side (host's cache_status response + options-cache.js
three-state UX + Re-extract IDs button) ships in a separate commit
in the extension repo.
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
165 lines
5.8 KiB
Python
165 lines
5.8 KiB
Python
"""cache.json I/O.

This module owns the on-disk cache contract: where the file lives,
what the header looks like, and how mismatches are handled. The
contract is the two-tier `cache_schema` + `id_rules` model from
docs/CACHE_CONTRACT.md (extension repo).

    cache_schema        on-disk shape. Mismatch -> force rebuild.
    id_rules            integer; bumps when extraction rules change.
                        Mismatch -> mark stale, allow lazy re-extract.
    id_rules_signature  sha256 over canonical rule text (see
                        rcjav.ids.current_rules_signature). Belt-and-
                        braces drift check that catches a forgotten
                        `id_rules` bump.

Legacy users on `version: 3` get an in-place header upgrade with no
forced rescan; the cache is marked as `id_rules: 0` so it shows up
as "stale by rules" until they Re-extract IDs.
"""
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import os
|
|
import sys
|
|
import time
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
|
|
|
|
# Lives next to rc-jav.py at the repo root.
# Resolved as "cache.json" in the parent of the directory holding this module.
CACHE_PATH = Path(__file__).resolve().parents[1] / "cache.json"
# NOTE(review): not referenced in the visible part of this module; presumably
# the age in hours past which callers treat a scan as stale — confirm against
# the callers that import it.
CACHE_STALE_HOURS = 24

# Two-tier version contract (see docs/CACHE_CONTRACT.md):
CACHE_SCHEMA_VERSION = 1  # on-disk shape; bump = force rebuild
ID_RULES_VERSION = 1  # extraction rules; bump = mark stale (lazy re-extract)

# Legacy alias preserved for any external caller that still imports it.
# Maps to CACHE_SCHEMA_VERSION + ID_RULES_VERSION under the new contract.
# The value matches the legacy `version: 3` header that load_cache migrates.
CACHE_VERSION = 3
|
|
|
|
|
|
def _fresh_cache(signature: str = "unknown") -> dict:
    """Build an empty cache dict carrying the current header.

    The header is stamped with the module's current `cache_schema` and
    `id_rules` versions plus the given `signature`; `remotes` starts out
    as a new empty dict on every call.
    """
    header = dict(
        cache_schema=CACHE_SCHEMA_VERSION,
        id_rules=ID_RULES_VERSION,
        id_rules_signature=signature,
    )
    # Added last so the serialized key order stays header-first.
    header["remotes"] = {}
    return header
|
|
|
|
|
|
def _migrate_legacy_v3(data: dict) -> dict:
    """Re-header a legacy `version: 3` cache without touching its entries.

    Returns a new dict that reuses the `remotes` mapping from `data`
    (the same object, not a copy) under the current `cache_schema`.
    The header is deliberately stamped `id_rules: 0` with signature
    "legacy" so the result classifies as "stale by rules": the user sees
    the amber state and can opt into a fast Re-extract instead of an
    rclone re-scan. Only the `remotes` key of `data` is carried over.
    """
    remotes = data.get("remotes", {})
    migrated = {
        "cache_schema": CACHE_SCHEMA_VERSION,
        "id_rules": 0,
        "id_rules_signature": "legacy",
    }
    migrated["remotes"] = remotes
    return migrated
|
|
|
|
|
|
def load_cache(current_signature: str | None = None) -> dict:
    """Read cache.json and return a cache dict with the current header.

    `current_signature` is the value of `rcjav.ids.current_rules_signature()`
    captured by the caller. It is only stamped into the header when this
    function has to mint a *fresh* cache ("unknown" is used when it is
    None or empty); when migrating legacy data the header is deliberately
    stamped `"legacy"` so the cache reads as stale-by-rules.

    Outcomes, in the order they are checked:

    * File missing, unreadable, not valid UTF-8, or not valid JSON
      -> fresh empty cache.
    * Top level is not a dict, or `remotes` is not a dict
      -> fresh empty cache.
    * Legacy header (`version` present, `cache_schema` absent):
      version 3 is migrated via `_migrate_legacy_v3`; any other version
      prints a warning to stderr and yields a fresh empty cache.
    * `cache_schema` differs from `CACHE_SCHEMA_VERSION`
      -> warning on stderr and a fresh empty cache (force rebuild).
    * Otherwise the parsed data is returned unchanged.

    This function never writes to disk.
    """
    fresh_sig = current_signature or "unknown"

    try:
        data = json.loads(CACHE_PATH.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # OSError covers a missing or unreadable file (no separate exists()
        # check needed, which would also be racy). ValueError is the common
        # parent of json.JSONDecodeError and UnicodeDecodeError, so a cache
        # corrupted with invalid UTF-8 bytes falls back to a fresh cache
        # instead of crashing the caller.
        return _fresh_cache(fresh_sig)

    if not isinstance(data, dict) or not isinstance(data.get("remotes"), dict):
        return _fresh_cache(fresh_sig)

    # Legacy header: { "version": 3, "remotes": {...} } — migrate the header
    # and keep the scanned entries.
    if "version" in data and "cache_schema" not in data:
        if data.get("version") == 3:
            return _migrate_legacy_v3(data)
        sys.stderr.write(
            f"[warn] unknown legacy cache version {data.get('version')!r}; "
            f"rebuilding.\n"
        )
        return _fresh_cache(fresh_sig)

    # New header: validate schema. Mismatch = force rebuild (per contract).
    if data.get("cache_schema") != CACHE_SCHEMA_VERSION:
        sys.stderr.write(
            f"[warn] cache_schema mismatch (got {data.get('cache_schema')!r}, "
            f"expected {CACHE_SCHEMA_VERSION}); forcing full rescan.\n"
        )
        return _fresh_cache(fresh_sig)

    return data
|
|
|
|
|
|
def cache_state(cache: dict, current_signature: str) -> str:
    """Classify a cache dict against the live rule set.

    Returns one of:

    * "schema_mismatch" — `cache_schema` differs from
      `CACHE_SCHEMA_VERSION`. load_cache already rebuilds in that case,
      so this normally only shows up in diagnostics flows that read
      cache.json directly without going through load_cache.
    * "stale_by_rules" — the schema matches but `id_rules` or
      `id_rules_signature` differs from the current values.
    * "fresh" — schema, rules version and signature all match.
    """
    if cache.get("cache_schema") != CACHE_SCHEMA_VERSION:
        return "schema_mismatch"
    # Either a version bump or a signature drift is enough to mark stale.
    if cache.get("id_rules") != ID_RULES_VERSION:
        return "stale_by_rules"
    if cache.get("id_rules_signature") != current_signature:
        return "stale_by_rules"
    return "fresh"
|
|
|
|
|
|
def stamp_current_rules(cache: dict, current_signature: str) -> None:
    """Mark `cache` as extracted under the live rule set, in place.

    Overwrites `id_rules` with `ID_RULES_VERSION` and
    `id_rules_signature` with `current_signature`. Intended to be called
    once a re-extract or full scan against the live rules has completed
    successfully. Returns nothing; the dict passed in is mutated.
    """
    cache.update(
        id_rules=ID_RULES_VERSION,
        id_rules_signature=current_signature,
    )
|
|
|
|
|
|
def save_cache(cache: dict) -> None:
    """Serialize `cache` to CACHE_PATH via a sibling temp file.

    The JSON is first written to `<CACHE_PATH>.tmp` and then moved over
    the real file with `os.replace`, so an interrupted write (Ctrl-C,
    power loss, concurrent --scan) cannot leave a half-written
    cache.json behind — load_cache would otherwise see invalid JSON and
    fall back to an empty cache, forcing a full re-scan.

    A PermissionError on the replace is retried exactly once after a
    0.5 s pause; a second failure propagates to the caller.
    """
    payload = json.dumps(cache, indent=2)
    staging = CACHE_PATH.with_suffix(CACHE_PATH.suffix + ".tmp")
    staging.write_text(payload, encoding="utf-8")
    try:
        os.replace(staging, CACHE_PATH)
    except PermissionError:
        # Windows: the destination may be briefly locked by antivirus or a
        # concurrent reader; give it a moment and try one more time.
        time.sleep(0.5)
        os.replace(staging, CACHE_PATH)
|
|
|
|
|
|
def cache_age_hours(scanned_at: str) -> float | None:
    """Return how many hours ago the ISO-8601 timestamp `scanned_at` was.

    A "Z" in the timestamp is rewritten to "+00:00" before parsing with
    `datetime.fromisoformat`. Timezone-aware timestamps are compared
    against "now" in the same timezone; naive timestamps are compared
    against the naive local "now".

    Returns None when `scanned_at` is not a string (e.g. a cache entry
    with the field missing) or cannot be parsed as an ISO timestamp.
    """
    # A missing or non-string field would otherwise raise AttributeError on
    # .replace(); treat it like any other unparseable value.
    if not isinstance(scanned_at, str):
        return None
    try:
        dt = datetime.fromisoformat(scanned_at.replace("Z", "+00:00"))
    except ValueError:
        return None
    # Never mix naive and aware datetimes: subtracting them raises TypeError.
    now = datetime.now(dt.tzinfo) if dt.tzinfo else datetime.now()
    return (now - dt).total_seconds() / 3600.0
|
|
|
|
|
|
def fmt_age(hours: float) -> str:
    """Render an age given in hours as a short human-readable string.

    Under one hour: whole minutes, truncated toward zero (e.g. "30m").
    Under 24 hours: hours with one decimal (e.g. "5.5h").
    Otherwise: days with one decimal (e.g. "1.5d").
    """
    if hours < 1:
        minutes = int(hours * 60)
        return f"{minutes}m"
    if hours < 24:
        value, unit = hours, "h"
    else:
        value, unit = hours / 24, "d"
    return f"{value:.1f}{unit}"
|