Sync working tree before initial Gitea push

Includes:
- cli.py path fix (parents[1]) for config/catalog resolution
- Library cleanup feature design docs (TODO.md, mockup)
- Audit + bug-queue markdowns from May 2026 reliability pass
- .gitignore expanded for transient artifacts
This commit is contained in:
admin
2026-05-26 22:35:42 +02:00
parent 8d6bdb81af
commit f7fc15b17c
24 changed files with 2938 additions and 41 deletions
+3
View File
@@ -39,7 +39,10 @@ from rcjav.output import ( # noqa: F401
write_json,
)
from rcjav.library import ( # noqa: F401
classify_filename_hygiene,
find_library_issues,
find_missing_resolution,
find_resolution_noncanonical,
rename_file_in_remote,
rename_files_batch,
)
+72 -9
View File
@@ -125,10 +125,10 @@ console = Console() # replaced in main() if --no-color
# Default remotes used when --search is invoked without explicit --source/--target.
DEFAULT_SOURCE = ["cq:personal-files/ClearJAV"]
DEFAULT_TARGET = ["cq:personal-files/JAV/TMP"]
DEFAULT_TARGET = ["cq:JAV"]
# Default WinCatalog export folder (or specific files). Folder entries auto-discover *.csv / *.xml.
DEFAULT_CATALOG: list[str] = [str(Path(__file__).resolve().parent / "wincatalog")]
DEFAULT_CATALOG: list[str] = [str(Path(__file__).resolve().parents[1] / "wincatalog")]
from rcjav.catalog import (
CATALOG_COL_NAME,
@@ -162,13 +162,14 @@ from rcjav.dupes import (
)
from rcjav.library import (
find_library_issues,
find_missing_resolution,
rename_file_in_remote,
rename_files_batch,
_bracket_to_canonical,
_nohyphen_to_canonical,
)
CONFIG_PATH = Path(__file__).resolve().parent / "config.json"
CONFIG_PATH = Path(__file__).resolve().parents[1] / "config.json"
def load_config() -> dict:
if not CONFIG_PATH.exists():
@@ -185,7 +186,14 @@ def load_config() -> dict:
def save_config(cfg: dict) -> None:
    """Write *cfg* to ``CONFIG_PATH`` as indented JSON via a temp-file swap.

    The JSON is first written to a sibling ``<name>.tmp`` file and then moved
    over ``CONFIG_PATH`` with a single ``os.replace`` call, so the destination
    is never opened for writing directly.

    A ``PermissionError`` from the move is retried once after a 0.5 s pause.
    If the retry fails too, the temp file is removed (best effort) and the
    error propagates to the caller.
    """
    tmp = CONFIG_PATH.with_suffix(CONFIG_PATH.suffix + ".tmp")
    tmp.write_text(json.dumps(cfg, indent=2), encoding="utf-8")
    try:
        # Exactly one move per attempt: once it succeeds the temp file is gone,
        # so a second unconditional os.replace would raise FileNotFoundError.
        os.replace(tmp, CONFIG_PATH)
        return
    except PermissionError:
        # Windows: destination may be briefly locked by antivirus or a concurrent reader.
        # Mirrors save_cache's retry to avoid asymmetric crash-on---save when
        # cache writes would succeed under the same conditions.
        time.sleep(0.5)
    try:
        os.replace(tmp, CONFIG_PATH)
    except OSError:
        # Do not leave a stale .tmp next to the config; the original error is
        # what the caller needs to see, so a failed cleanup is ignored.
        try:
            tmp.unlink()
        except OSError:
            pass
        raise
# ---------- collectors ----------
@@ -271,7 +279,6 @@ def cached_collect(remotes: list[str], source_label: str,
if use_cache:
cache["remotes"][r] = {
"scanned_at": datetime.now().astimezone().isoformat(),
"recursive": True,
"files": [{"path": e.path, "size": e.size, "mod_time": e.mod_time,
"jav_id": e.jav_id} for e in fresh],
"skipped": local_skipped,
@@ -325,7 +332,6 @@ def cached_collect(remotes: list[str], source_label: str,
if use_cache:
cache["remotes"][r] = {
"scanned_at": datetime.now().astimezone().isoformat(),
"recursive": True,
"files": merged_files,
"skipped": sorted(old_skipped),
}
@@ -392,6 +398,12 @@ def main():
ap.add_argument("--library-issues", action="store_true",
help="Report non-canonical filenames (bracket-wrapped IDs, no-hyphen IDs). "
"Reads from cache. Outputs JSON when --format json, plain otherwise.")
ap.add_argument("--missing-resolution", action="store_true",
help="Report cached video files whose filename does not end with a bracketed "
"[resolution] tag before the extension. No live rclone calls.")
ap.add_argument("--limit", type=int, default=None,
help="Limit displayed/report items for cache audit modes. Use 0 for all. "
"Human --missing-resolution defaults to 100; JSON defaults to all.")
ap.add_argument("--rename-file", action="store_true",
help="Rename one file in a remote and patch cache. "
"Requires --remote, --old-path, --new-path. Outputs JSON.")
@@ -419,7 +431,7 @@ def main():
ap.add_argument("--clearjav", action="store_true",
help="Shortcut: use DEFAULT_SOURCE as --source and DEFAULT_TARGET as --target, "
"Equivalent to "
"`--source cq:personal-files/ClearJAV --target cq:personal-files/JAV/TMP`.")
"`--source cq:personal-files/ClearJAV --target cq:JAV`.")
args = ap.parse_args()
global console, BASIC, DEFAULT_SOURCE, DEFAULT_TARGET, DEFAULT_CATALOG
@@ -576,13 +588,15 @@ def main():
if args.library_issues:
cache = load_cache()
issues = find_library_issues(cache)
issues = find_library_issues(cache, cfg)
if args.format == "json" or BASIC:
print(json.dumps({"ok": True, **issues}))
else:
bracket = issues["bracket_names"]
nohyphen = issues["nohyphen_names"]
total = len(bracket) + len(nohyphen)
missing = issues.get("missing_resolution", [])
noncanonical = issues.get("resolution_noncanonical", [])
total = len(bracket) + len(nohyphen) + len(missing) + len(noncanonical)
if not total:
console.print(Panel("[bold green]No library issues found.[/]", title="Library Issues"))
else:
@@ -598,6 +612,55 @@ def main():
for e in nohyphen:
t.add_row("no hyphen", Path(e["path"]).name,
e["canonical_name"], e["remote"])
for e in noncanonical:
kinds = ", ".join(i.get("kind", "") for i in e.get("issues", []) if i.get("kind"))
t.add_row("resolution style", Path(e["path"]).name,
kinds or "noncanonical", e["remote"])
for e in missing:
kinds = ", ".join(i.get("kind", "") for i in e.get("issues", []) if i.get("kind"))
t.add_row("missing resolution", Path(e["path"]).name,
kinds or "needs probe", e["remote"])
console.print(t)
sys.exit(0)
if args.missing_resolution:
cache = load_cache()
report = find_missing_resolution(cache, cfg)
limit = args.limit
if limit is None:
limit = None if args.format == "json" else 100
items = report["items"] if limit in (None, 0) else report["items"][:max(0, limit)]
out = {
"ok": True,
**report,
"shown": len(items),
"truncated": len(items) < report["count"],
"items": items,
}
if args.format == "json":
print(json.dumps(out))
elif BASIC:
for item in items:
print(item["full_path"])
if out["truncated"]:
print(f"# Showing {out['shown']} of {out['count']}. Use --limit 0 to show all.", file=sys.stderr)
else:
total = report["count"]
if not total:
console.print(Panel("[bold green]No missing resolution tags found.[/]", title="Missing Resolution"))
else:
from rich.table import Table
by_ext = ", ".join(f"{k}: {v:,}" for k, v in report["by_extension"].items()) or "none"
summary = f"{total:,} file(s) missing final bracketed [resolution] tag\n{by_ext}"
if out["truncated"]:
summary += f"\nShowing first {out['shown']:,}. Use --limit 0 to show all, or --format json for machine output."
console.print(Panel(summary, title="Missing Resolution", border_style="yellow"))
t = Table(title="Cached files", show_lines=False)
t.add_column("Path")
t.add_column("Remote", style="dim")
t.add_column("Size", justify="right")
for e in items:
t.add_row(e["path"], e["remote"], e["size_human"])
console.print(t)
sys.exit(0)
+216 -29
View File
@@ -12,7 +12,9 @@ batch of renames.
"""
from __future__ import annotations
import re
import subprocess
from collections import Counter
from pathlib import Path
from rcjav.cache import save_cache
@@ -24,15 +26,118 @@ from rcjav.ids import (
PRIMARY_ID_RE,
extract_id,
)
from rcjav.output import human_size as _human_size
# Lower-case file extensions (leading dot included) that the cache audits
# treat as video files; anything else is skipped.
VIDEO_EXTS = {".avi", ".m4v", ".mkv", ".mov", ".mp4", ".mpeg", ".mpg", ".ts", ".webm", ".wmv"}
# The patterns below are searched against a filename stem. A "resolution token"
# is 3-4 digits followed by "p" or "i" (1080p, 480i), or the literal 4k / 8k.
#
# Canonical form: the stem ends with the bracketed token, e.g. "ABC-123 [1080p]".
CANONICAL_RESOLUTION_RE = re.compile(r"\[(?P<resolution>\d{3,4}[pi]|4k|8k)\]$", re.IGNORECASE)
# Bracketed token followed by a parenthesised number at the end of the stem,
# e.g. "ABC-123 [1080p] (1)"; the number is captured as "copy".
RESOLUTION_COPY_SUFFIX_RE = re.compile(r"\[(?P<resolution>\d{3,4}[pi]|4k|8k)\]\s*\((?P<copy>\d+)\)$", re.IGNORECASE)
# Bracketed token followed by a part marker (NofM, partN, ptN), optionally
# separated by any of ". _ -" and optionally trailed by dots/whitespace,
# e.g. "ABC-123 [720p]-part2"; the marker is captured as "part".
RESOLUTION_PART_SUFFIX_RE = re.compile(
r"\[(?P<resolution>\d{3,4}[pi]|4k|8k)\][._ -]*(?P<part>\d+of\d+|part\d+|pt\d+)[.\s]*$",
re.IGNORECASE,
)
# Un-bracketed token ending the stem, either as the whole stem or preceded by
# one of ". _ -" or a space, e.g. "ABC-123 1080p".
BARE_RESOLUTION_SUFFIX_RE = re.compile(r"(?:^|[._ -])(?P<resolution>\d{3,4}[pi]|4k|8k)$", re.IGNORECASE)
# Empty or whitespace-only brackets ending the stem, e.g. "ABC-123 []".
EMPTY_BRACKETS_RE = re.compile(r"\[\s*\]$")
# Any non-empty bracketed text ending the stem; the text is captured as "token".
BRACKET_TOKEN_SUFFIX_RE = re.compile(r"\[(?P<token>[^\]]+)\]$")
# Quality word (hd, fhd, uhd, sd, fullhd) ending the stem, either as the whole
# stem or preceded by one of ". _ -" or a space; captured as "quality".
HD_QUALITY_SUFFIX_RE = re.compile(r"(?:^|[._ -])(?P<quality>hd|fhd|uhd|sd|fullhd)$", re.IGNORECASE)
# Part/disc marker ending the stem after one of ". _ -" or a space:
# NofM, partN, ptN, cdN, discN, or a single letter a/b; captured as "part".
MULTIPART_SUFFIX_RE = re.compile(r"(?:[._ -])(?P<part>\d+of\d+|part\d+|pt\d+|cd\d+|disc\d+|[ab])$", re.IGNORECASE)
def _human_size(n: int) -> str:
nf = float(max(0, n))
for unit in ("B", "KiB", "MiB", "GiB", "TiB"):
if nf < 1024:
return f"{int(nf)} B" if unit == "B" else f"{nf:.2f} {unit}"
nf /= 1024
return f"{nf:.2f} PiB"
def _issue(kind: str, *, source: str = "builtin", severity: str = "info", **extra) -> dict:
return {"kind": kind, "source": source, "severity": severity, **extra}
def _compile_custom_filename_rules(config: dict | None) -> list[dict]:
rules = ((config or {}).get("filename_hygiene") or {}).get("custom_rules") or []
compiled = []
for i, rule in enumerate(rules):
if not isinstance(rule, dict) or rule.get("enabled", True) is False:
continue
pattern = rule.get("pattern") or rule.get("match")
kind = rule.get("kind") or rule.get("name") or f"custom_rule_{i + 1}"
if not pattern:
continue
try:
compiled.append({
"name": rule.get("name") or kind,
"kind": kind,
"severity": rule.get("severity") or "info",
"target": rule.get("target") or "filename",
"regex": re.compile(pattern, re.IGNORECASE if rule.get("ignore_case", True) else 0),
})
except re.error:
continue
return compiled
def classify_filename_hygiene(filename: str, config: dict | None = None) -> dict:
    """Describe how well *filename* follows the resolution-tag convention.

    The built-in checks look only at the stem (the name without its final
    extension). Nothing is renamed; the result is purely descriptive.

    Returns a dict with:
        has_resolution: True when a recognised resolution token ends the stem,
            in canonical or non-canonical form.
        resolution_style: ``"canonical"``, ``"noncanonical"`` or ``"missing"``.
        issues: issue records built by ``_issue`` - built-in findings first,
            then one record per matching custom rule from *config*.
    """
    stem = Path(filename).stem
    findings: list[dict] = []

    # Non-canonical spellings, tried in order; the first hit wins. The third
    # field names an extra capture group copied into the issue record.
    noncanonical_probes = (
        (RESOLUTION_COPY_SUFFIX_RE, "resolution_copy_suffix", "copy"),
        (RESOLUTION_PART_SUFFIX_RE, "resolution_part_suffix", "part"),
        (BARE_RESOLUTION_SUFFIX_RE, "resolution_bare_suffix", None),
    )

    style = "missing"
    canonical = CANONICAL_RESOLUTION_RE.search(stem)
    if canonical is not None:
        style = "canonical"
        findings.append(_issue(
            "resolution_canonical",
            resolution=canonical.group("resolution").lower(),
        ))
    else:
        for regex, kind, extra_group in noncanonical_probes:
            hit = regex.search(stem)
            if hit is None:
                continue
            details = {"resolution": hit.group("resolution").lower()}
            if extra_group is not None:
                details[extra_group] = hit.group(extra_group)
            findings.append(_issue(kind, severity="cleanup", **details))
            style = "noncanonical"
            break
    has_resolution = style != "missing"

    if not has_resolution:
        # With no resolution recognised, record hints about what the tail of
        # the stem holds instead, so a later probe knows what to expect.
        findings.append(_issue("missing_resolution", severity="needs_probe"))
        if EMPTY_BRACKETS_RE.search(stem):
            findings.append(_issue("resolution_placeholder_empty", severity="needs_probe", token="[]"))
        elif (quality := HD_QUALITY_SUFFIX_RE.search(stem)) is not None:
            findings.append(_issue(
                "quality_marker_not_resolution",
                severity="needs_probe",
                token=quality.group("quality"),
            ))
        elif (bracketed := BRACKET_TOKEN_SUFFIX_RE.search(stem)) is not None:
            findings.append(_issue(
                "suspicious_bracket_token",
                severity="needs_probe",
                token=bracketed.group("token"),
            ))
        multipart = MULTIPART_SUFFIX_RE.search(stem)
        if multipart is not None:
            findings.append(_issue(
                "multipart_without_resolution",
                severity="needs_probe",
                part=multipart.group("part"),
            ))

    for rule in _compile_custom_filename_rules(config):
        # Only the bare filename is available here, so every target other
        # than "stem" (including "path") is matched against the filename.
        haystack = stem if rule["target"] == "stem" else filename
        found = rule["regex"].search(haystack)
        if found is None:
            continue
        findings.append(_issue(
            rule["kind"],
            source="custom",
            severity=rule["severity"],
            name=rule["name"],
            matched=found.group(0),
        ))

    return {
        "has_resolution": has_resolution,
        "resolution_style": style,
        "issues": findings,
    }
def _bracket_to_canonical(filename: str) -> str:
@@ -61,7 +166,84 @@ def _nohyphen_to_canonical(filename: str) -> str:
return f"{prefix}-{num_str}{rest}{suffix}"
def find_library_issues(cache: dict) -> dict:
def _cache_entry(remote: str, f: dict, issue: str, **extra) -> dict:
    """Build the report record for one cached file.

    Args:
        remote: Remote name the file was cached under.
        f: Cached file dict; ``path``, ``size``, ``mod_time`` and ``jav_id``
            are read, each with a neutral default when absent.
        issue: Issue label stored under the ``issue`` key.
        **extra: Additional keys appended after the standard ones (a key that
            repeats a standard one replaces its value).

    Returns:
        A dict with remote/path information, the derived ``full_path``,
        ``filename`` and lower-cased ``extension``, size (raw and
        human-readable), ``mod_time``, ``jav_id`` and ``issue``.
    """
    rel_path = f.get("path", "")
    base_name = Path(rel_path).name
    raw_size = f.get("size", 0)
    # Insert "/" only when there is a path to append and the remote does not
    # already end with one.
    joiner = "/" if rel_path and not remote.endswith("/") else ""
    record = {
        "remote": remote,
        "path": rel_path,
        "full_path": f"{remote}{joiner}{rel_path}",
        "filename": base_name,
        "extension": Path(base_name).suffix.lower(),
        "size": raw_size,
        "size_human": _human_size(raw_size),
        "mod_time": f.get("mod_time", ""),
        "jav_id": f.get("jav_id", ""),
        "issue": issue,
    }
    record.update(extra)
    return record
def find_missing_resolution(cache: dict, config: dict | None = None) -> dict:
    """Report cached video files whose stem carries no resolution token.

    Walks every file under ``cache["remotes"]``, keeps those with an
    extension in ``VIDEO_EXTS`` and classifies the filename. A file is
    reported when the classification says ``has_resolution`` is false.

    Returns:
        A dict with the fixed ``issue``/``source`` labels, the number of
        reported files, per-extension and per-remote counts (keys sorted) and
        the list of report records.
    """
    cached_files = (
        (remote, record)
        for remote, remote_data in cache.get("remotes", {}).items()
        for record in remote_data.get("files", [])
    )

    flagged: list[dict] = []
    ext_counts: Counter[str] = Counter()
    remote_counts: Counter[str] = Counter()
    for remote, record in cached_files:
        name = Path(record.get("path", "")).name
        suffix = Path(name).suffix.lower()
        if suffix in VIDEO_EXTS:
            verdict = classify_filename_hygiene(name, config)
            if not verdict["has_resolution"]:
                flagged.append(_cache_entry(remote, record, "missing_resolution", **verdict))
                ext_counts[suffix] += 1
                remote_counts[remote] += 1

    return {
        "issue": "missing_resolution",
        "source": "cache",
        "count": len(flagged),
        "by_extension": dict(sorted(ext_counts.items())),
        "by_remote": dict(sorted(remote_counts.items())),
        "items": flagged,
    }
def find_resolution_noncanonical(cache: dict, config: dict | None = None) -> dict:
    """Report cached video files whose resolution is present but not canonical.

    Walks every file under ``cache["remotes"]``, keeps those with an
    extension in ``VIDEO_EXTS`` and classifies the filename. A file is
    reported when its ``resolution_style`` is ``"noncanonical"``.

    Returns:
        A dict with the fixed ``issue``/``source`` labels, the number of
        reported files, counts per issue kind (every issue attached to a
        reported file is counted) and per extension, both with sorted keys,
        and the list of report records.
    """
    cached_files = (
        (remote, record)
        for remote, remote_data in cache.get("remotes", {}).items()
        for record in remote_data.get("files", [])
    )

    flagged: list[dict] = []
    kind_counts: Counter[str] = Counter()
    ext_counts: Counter[str] = Counter()
    for remote, record in cached_files:
        name = Path(record.get("path", "")).name
        suffix = Path(name).suffix.lower()
        if suffix in VIDEO_EXTS:
            verdict = classify_filename_hygiene(name, config)
            if verdict["resolution_style"] == "noncanonical":
                flagged.append(_cache_entry(remote, record, "resolution_noncanonical", **verdict))
                ext_counts[suffix] += 1
                kind_counts.update(found["kind"] for found in verdict["issues"])

    return {
        "issue": "resolution_noncanonical",
        "source": "cache",
        "count": len(flagged),
        "by_kind": dict(sorted(kind_counts.items())),
        "by_extension": dict(sorted(ext_counts.items())),
        "items": flagged,
    }
def find_library_issues(cache: dict, config: dict | None = None) -> dict:
"""Scan cache for files with non-canonical names.
Returns:
@@ -75,31 +257,36 @@ def find_library_issues(cache: dict) -> dict:
fname = Path(f["path"]).name
stem = Path(fname).stem
if stem.startswith("[") and _BRACKET_ID_RE.match(stem):
bracket.append({
"remote": remote,
"path": f["path"],
"size": f.get("size", 0),
"size_human": _human_size(f.get("size", 0)),
"mod_time": f.get("mod_time", ""),
"jav_id": f.get("jav_id", ""),
"canonical_name": _bracket_to_canonical(fname),
"issue": "bracket_id",
})
bracket.append(_cache_entry(
remote, f, "bracket_id",
canonical_name=_bracket_to_canonical(fname),
))
elif (not PRIMARY_ID_RE.match(stem)
and not COMPOUND_ID_RE.match(stem)
and not FALLBACK_ID_RE.match(stem)
and _NOHYPHEN_ID_RE.match(stem)):
nohyphen.append({
"remote": remote,
"path": f["path"],
"size": f.get("size", 0),
"size_human": _human_size(f.get("size", 0)),
"mod_time": f.get("mod_time", ""),
"jav_id": f.get("jav_id", ""),
"canonical_name": _nohyphen_to_canonical(fname),
"issue": "nohyphen_id",
})
return {"bracket_names": bracket, "nohyphen_names": nohyphen}
nohyphen.append(_cache_entry(
remote, f, "nohyphen_id",
canonical_name=_nohyphen_to_canonical(fname),
))
missing_resolution = find_missing_resolution(cache, config)
resolution_noncanonical = find_resolution_noncanonical(cache, config)
return {
"bracket_names": bracket,
"nohyphen_names": nohyphen,
"missing_resolution": missing_resolution["items"],
"missing_resolution_summary": {
"count": missing_resolution["count"],
"by_extension": missing_resolution["by_extension"],
"by_remote": missing_resolution["by_remote"],
},
"resolution_noncanonical": resolution_noncanonical["items"],
"resolution_noncanonical_summary": {
"count": resolution_noncanonical["count"],
"by_kind": resolution_noncanonical["by_kind"],
"by_extension": resolution_noncanonical["by_extension"],
},
}
def rename_file_in_remote(
+19 -1
View File
@@ -12,6 +12,7 @@ import re
import subprocess
import sys
import threading
import time
from pathlib import Path
from rcjav.ids import RANGE_RE, expand_range, extract_id, normalize_id
@@ -24,7 +25,10 @@ RCLONE_BIN = "rclone"
# extension popup. walk_remote checks for it every CANCEL_CHECK_INTERVAL files
# and exits cleanly if found.
CANCEL_FLAG = Path(__file__).resolve().parents[1] / "scan-cancel.flag"
CANCEL_CHECK_INTERVAL = 100 # check / emit progress every N files
CANCEL_CHECK_INTERVAL = 25
PROGRESS_EMIT_MIN_FILES = 25
PROGRESS_EMIT_MIN_GAP_S = 0.25
PROGRESS_EMIT_MAX_GAP_S = 1.0
# Toggled from rc-jav.py main() when --basic is passed. Affects whether
# walk_remote emits machine-parseable progress lines on stderr.
@@ -234,6 +238,8 @@ def walk_remote(remote: str, source_label: str,
)
_stderr_thread.start()
_cancelled = False
last_emit_n = 0
last_emit_ts = time.monotonic()
try:
for line in proc.stdout:
line = line.rstrip("\n").rstrip("\r")
@@ -272,12 +278,24 @@ def walk_remote(remote: str, source_label: str,
proc.kill()
_cancelled = True
break
if BASIC and n > 0:
now = time.monotonic()
files_since_emit = n - last_emit_n
elapsed_since_emit = now - last_emit_ts
should_emit_progress = (
files_since_emit >= PROGRESS_EMIT_MIN_FILES
and elapsed_since_emit >= PROGRESS_EMIT_MIN_GAP_S
) or elapsed_since_emit >= PROGRESS_EMIT_MAX_GAP_S
if not should_emit_progress:
continue
sys.stderr.write("SCAN_FILE_PROGRESS " + json.dumps({
"remote": remote, "label": source_label,
"files": len(entries), "skipped": len(local_skipped),
"total": total,
}) + "\n")
sys.stderr.flush()
last_emit_n = n
last_emit_ts = now
except KeyboardInterrupt:
proc.terminate()
try: