Sync working tree before initial Gitea push

Includes:
- cli.py path fix (parents[1]) for config/catalog resolution
- Library cleanup feature design docs (TODO.md, mockup)
- Audit + bug-queue markdowns from the May 2026 reliability pass
- .gitignore expanded for transient artifacts
2026-05-26 22:35:42 +02:00
parent 8d6bdb81af
commit f7fc15b17c
24 changed files with 2938 additions and 41 deletions
@@ -12,7 +12,9 @@ batch of renames.
 """
 from __future__ import annotations

+import re
 import subprocess
+from collections import Counter
 from pathlib import Path

 from rcjav.cache import save_cache
@@ -24,15 +26,118 @@ from rcjav.ids import (
    PRIMARY_ID_RE,
    extract_id,
 )
+from rcjav.output import human_size as _human_size
+
+# Lower-case extensions (with leading dot) that the find_* cache scans treat as video files.
+VIDEO_EXTS = {".avi", ".m4v", ".mkv", ".mov", ".mp4", ".mpeg", ".mpg", ".ts", ".webm", ".wmv"}
+# Stem ends with a bracketed resolution token, e.g. "[1080p]" or "[4k]".
+CANONICAL_RESOLUTION_RE = re.compile(r"\[(?P<resolution>\d{3,4}[pi]|4k|8k)\]$", re.IGNORECASE)
+# Bracketed resolution followed by a parenthesised number, e.g. "[1080p] (1)".
+RESOLUTION_COPY_SUFFIX_RE = re.compile(r"\[(?P<resolution>\d{3,4}[pi]|4k|8k)\]\s*\((?P<copy>\d+)\)$", re.IGNORECASE)
+# Bracketed resolution followed by a part marker, e.g. "[720p]-part2" or "[720p] 1of3".
+RESOLUTION_PART_SUFFIX_RE = re.compile(
+    r"\[(?P<resolution>\d{3,4}[pi]|4k|8k)\][._ -]*(?P<part>\d+of\d+|part\d+|pt\d+)[.\s]*$",
+    re.IGNORECASE,
+)
+# Unbracketed resolution token at the very end of the stem, e.g. "-1080p".
+BARE_RESOLUTION_SUFFIX_RE = re.compile(r"(?:^|[._ -])(?P<resolution>\d{3,4}[pi]|4k|8k)$", re.IGNORECASE)
+# Stem ends with empty (or whitespace-only) square brackets.
+EMPTY_BRACKETS_RE = re.compile(r"\[\s*\]$")
+# Stem ends with any non-empty bracketed token; the text inside is captured as "token".
+BRACKET_TOKEN_SUFFIX_RE = re.compile(r"\[(?P<token>[^\]]+)\]$")
+# Quality word (hd, fhd, uhd, sd, fullhd) at the end of the stem; it names no concrete resolution.
+HD_QUALITY_SUFFIX_RE = re.compile(r"(?:^|[._ -])(?P<quality>hd|fhd|uhd|sd|fullhd)$", re.IGNORECASE)
+# Separator plus a part marker (NofM, partN, ptN, cdN, discN, or a single "a"/"b") at the end of the stem.
+MULTIPART_SUFFIX_RE = re.compile(r"(?:[._ -])(?P<part>\d+of\d+|part\d+|pt\d+|cd\d+|disc\d+|[ab])$", re.IGNORECASE)


-def _human_size(n: int) -> str:
-    nf = float(max(0, n))
-    for unit in ("B", "KiB", "MiB", "GiB", "TiB"):
-        if nf < 1024:
-            return f"{int(nf)} B" if unit == "B" else f"{nf:.2f} {unit}"
-        nf /= 1024
-    return f"{nf:.2f} PiB"
+def _issue(kind: str, *, source: str = "builtin", severity: str = "info", **extra) -> dict:
+    """Build one issue record: the three standard fields followed by any extra details."""
+    record = {"kind": kind, "source": source, "severity": severity}
+    record.update(extra)
+    return record
+
+
+def _compile_custom_filename_rules(config: dict | None) -> list[dict]:
+    """Compile the user-defined filename rules found in *config*.
+
+    Rules are read from ``config["filename_hygiene"]["custom_rules"]``. Each
+    rule is a dict: ``pattern`` (or ``match``) is the regex source, ``kind``
+    (or ``name``) labels the resulting issue, and ``severity``, ``target``,
+    ``enabled`` and ``ignore_case`` are optional.
+
+    Malformed input is skipped rather than raised: a section or rule list of
+    the wrong type yields no rules, and entries that are not dicts, are
+    disabled (``enabled`` is ``False``), have no pattern, or whose pattern
+    cannot be compiled are dropped individually.
+
+    Returns:
+        One dict per usable rule with keys ``name``, ``kind``, ``severity``,
+        ``target`` and ``regex`` (a compiled pattern), in config order.
+    """
+    hygiene = (config or {}).get("filename_hygiene")
+    rules = hygiene.get("custom_rules") if isinstance(hygiene, dict) else None
+    if not isinstance(rules, (list, tuple)):
+        # Missing, empty, scalar or mapping value: there is no rule list to walk.
+        return []
+    compiled = []
+    for i, rule in enumerate(rules):
+        if not isinstance(rule, dict) or rule.get("enabled", True) is False:
+            continue
+        pattern = rule.get("pattern") or rule.get("match")
+        if not pattern:
+            continue
+        # The fallback kind is numbered by the rule's 1-based position in the config list.
+        kind = rule.get("kind") or rule.get("name") or f"custom_rule_{i + 1}"
+        flags = re.IGNORECASE if rule.get("ignore_case", True) else 0
+        try:
+            regex = re.compile(pattern, flags)
+        except (re.error, TypeError):
+            # re.error: invalid regex syntax. TypeError: the pattern is not a
+            # string (e.g. a number in the config). Either way the rule is unusable.
+            continue
+        compiled.append({
+            "name": rule.get("name") or kind,
+            "kind": kind,
+            "severity": rule.get("severity") or "info",
+            "target": rule.get("target") or "filename",
+            "regex": regex,
+        })
+    return compiled
+
+
+def classify_filename_hygiene(filename: str, config: dict | None = None) -> dict:
+    """Report how a filename carries (or lacks) its resolution tag.
+
+    Read-only: the result lists findings; nothing is renamed or proposed.
+
+    Returns:
+        A dict with ``has_resolution`` (bool), ``resolution_style``
+        (``"canonical"``, ``"noncanonical"`` or ``"missing"``) and ``issues``
+        (built-in findings first, then one entry per matching custom rule
+        from *config*).
+    """
+    stem = Path(filename).stem
+    findings: list[dict] = []
+    style = "missing"
+
+    # Resolution forms are tried in priority order; the first match decides the style.
+    canonical = CANONICAL_RESOLUTION_RE.search(stem)
+    if canonical is not None:
+        style = "canonical"
+        findings.append(_issue("resolution_canonical", resolution=canonical.group("resolution").lower()))
+    else:
+        noncanonical_forms = (
+            (RESOLUTION_COPY_SUFFIX_RE, "resolution_copy_suffix", "copy"),
+            (RESOLUTION_PART_SUFFIX_RE, "resolution_part_suffix", "part"),
+            (BARE_RESOLUTION_SUFFIX_RE, "resolution_bare_suffix", None),
+        )
+        for regex, kind, detail_group in noncanonical_forms:
+            found = regex.search(stem)
+            if found is None:
+                continue
+            details = {"resolution": found.group("resolution").lower()}
+            if detail_group is not None:
+                details[detail_group] = found.group(detail_group)
+            findings.append(_issue(kind, severity="cleanup", **details))
+            style = "noncanonical"
+            break
+    has_resolution = style != "missing"
+
+    if not has_resolution:
+        findings.append(_issue("missing_resolution", severity="needs_probe"))
+        # At most one diagnosis of the trailing token is added ...
+        if EMPTY_BRACKETS_RE.search(stem):
+            findings.append(_issue("resolution_placeholder_empty", severity="needs_probe", token="[]"))
+        elif quality := HD_QUALITY_SUFFIX_RE.search(stem):
+            findings.append(_issue(
+                "quality_marker_not_resolution",
+                severity="needs_probe",
+                token=quality.group("quality"),
+            ))
+        elif bracketed := BRACKET_TOKEN_SUFFIX_RE.search(stem):
+            findings.append(_issue(
+                "suspicious_bracket_token",
+                severity="needs_probe",
+                token=bracketed.group("token"),
+            ))
+        # ... while the multipart check is independent of the ones above.
+        multipart = MULTIPART_SUFFIX_RE.search(stem)
+        if multipart is not None:
+            findings.append(_issue(
+                "multipart_without_resolution",
+                severity="needs_probe",
+                part=multipart.group("part"),
+            ))
+
+    for rule in _compile_custom_filename_rules(config):
+        # Only the "stem" target inspects the stem; every other target
+        # (including "path") sees the filename exactly as it was passed in.
+        subject = stem if rule["target"] == "stem" else filename
+        hit = rule["regex"].search(subject)
+        if hit is None:
+            continue
+        findings.append(_issue(
+            rule["kind"],
+            source="custom",
+            severity=rule["severity"],
+            name=rule["name"],
+            matched=hit.group(0),
+        ))
+
+    return {
+        "has_resolution": has_resolution,
+        "resolution_style": style,
+        "issues": findings,
+    }


 def _bracket_to_canonical(filename: str) -> str:
@@ -61,7 +166,84 @@ def _nohyphen_to_canonical(filename: str) -> str:
    return f"{prefix}-{num_str}{rest}{suffix}"


-def find_library_issues(cache: dict) -> dict:
+def _cache_entry(remote: str, f: dict, issue: str, **extra) -> dict:
+    """Build the report record for one cached file.
+
+    Args:
+        remote: Name of the remote the file was listed under.
+        f: Cached file record; ``path``, ``size``, ``mod_time`` and ``jav_id``
+            are read with defaults when absent.
+        issue: Issue label stored under the ``issue`` key.
+        **extra: Additional keys merged in last, so they win on a name clash.
+
+    Returns:
+        A flat dict describing the file, its location and the issue.
+    """
+    rel_path = f.get("path", "")
+    name = Path(rel_path).name
+    size = f.get("size", 0)
+    # Add a "/" only when there is a path to append and the remote does not already end with one.
+    joiner = "/" if rel_path and not remote.endswith("/") else ""
+    entry = {
+        "remote": remote,
+        "path": rel_path,
+        "full_path": f"{remote}{joiner}{rel_path}",
+        "filename": name,
+        "extension": Path(name).suffix.lower(),
+        "size": size,
+        "size_human": _human_size(size),
+        "mod_time": f.get("mod_time", ""),
+        "jav_id": f.get("jav_id", ""),
+        "issue": issue,
+    }
+    entry.update(extra)
+    return entry
+
+
+def find_missing_resolution(cache: dict, config: dict | None = None) -> dict:
+    """List cached video files whose name carries no recognised resolution.
+
+    Walks ``cache["remotes"][*]["files"]``, keeps entries whose extension is
+    in ``VIDEO_EXTS`` and whose hygiene classification has ``has_resolution``
+    false.
+
+    Returns:
+        A report dict with ``issue``, ``source``, ``count``, per-extension and
+        per-remote tallies (sorted by key) and the matching ``items``.
+    """
+    def _cached_videos():
+        # Yield (remote, file record, basename, lower-cased extension) for video files only.
+        for remote_name, remote_data in cache.get("remotes", {}).items():
+            for record in remote_data.get("files", []):
+                basename = Path(record.get("path", "")).name
+                suffix = Path(basename).suffix.lower()
+                if suffix in VIDEO_EXTS:
+                    yield remote_name, record, basename, suffix
+
+    missing: list[dict] = []
+    ext_counts: Counter[str] = Counter()
+    remote_counts: Counter[str] = Counter()
+    for remote_name, record, basename, suffix in _cached_videos():
+        verdict = classify_filename_hygiene(basename, config)
+        if not verdict["has_resolution"]:
+            missing.append(_cache_entry(remote_name, record, "missing_resolution", **verdict))
+            ext_counts[suffix] += 1
+            remote_counts[remote_name] += 1
+
+    report = {"issue": "missing_resolution", "source": "cache", "count": len(missing)}
+    report["by_extension"] = dict(sorted(ext_counts.items()))
+    report["by_remote"] = dict(sorted(remote_counts.items()))
+    report["items"] = missing
+    return report
+
+
+def find_resolution_noncanonical(cache: dict, config: dict | None = None) -> dict:
+    """List cached video files whose resolution is present but not canonical.
+
+    Walks ``cache["remotes"][*]["files"]``, keeps entries whose extension is
+    in ``VIDEO_EXTS`` and whose hygiene classification has
+    ``resolution_style == "noncanonical"``.
+
+    Returns:
+        A report dict with ``issue``, ``source``, ``count``, tallies per issue
+        kind and per extension (sorted by key) and the matching ``items``.
+    """
+    def _cached_videos():
+        # Yield (remote, file record, basename, lower-cased extension) for video files only.
+        for remote_name, remote_data in cache.get("remotes", {}).items():
+            for record in remote_data.get("files", []):
+                basename = Path(record.get("path", "")).name
+                suffix = Path(basename).suffix.lower()
+                if suffix in VIDEO_EXTS:
+                    yield remote_name, record, basename, suffix
+
+    flagged: list[dict] = []
+    kind_counts: Counter[str] = Counter()
+    ext_counts: Counter[str] = Counter()
+    for remote_name, record, basename, suffix in _cached_videos():
+        verdict = classify_filename_hygiene(basename, config)
+        if verdict["resolution_style"] == "noncanonical":
+            flagged.append(_cache_entry(remote_name, record, "resolution_noncanonical", **verdict))
+            ext_counts[suffix] += 1
+            # Every issue on the file is tallied, custom-rule hits included.
+            kind_counts.update(found["kind"] for found in verdict["issues"])
+
+    report = {"issue": "resolution_noncanonical", "source": "cache", "count": len(flagged)}
+    report["by_kind"] = dict(sorted(kind_counts.items()))
+    report["by_extension"] = dict(sorted(ext_counts.items()))
+    report["items"] = flagged
+    return report
+
+
+def find_library_issues(cache: dict, config: dict | None = None) -> dict:
    """Scan cache for files with non-canonical names.

    Returns:
@@ -75,31 +257,36 @@ def find_library_issues(cache: dict) -> dict:
            fname = Path(f["path"]).name
            stem = Path(fname).stem
            if stem.startswith("[") and _BRACKET_ID_RE.match(stem):
-                bracket.append({
-                    "remote": remote,
-                    "path": f["path"],
-                    "size": f.get("size", 0),
-                    "size_human": _human_size(f.get("size", 0)),
-                    "mod_time": f.get("mod_time", ""),
-                    "jav_id": f.get("jav_id", ""),
-                    "canonical_name": _bracket_to_canonical(fname),
-                    "issue": "bracket_id",
-                })
+                bracket.append(_cache_entry(
+                    remote, f, "bracket_id",
+                    canonical_name=_bracket_to_canonical(fname),
+                ))
            elif (not PRIMARY_ID_RE.match(stem)
                  and not COMPOUND_ID_RE.match(stem)
                  and not FALLBACK_ID_RE.match(stem)
                  and _NOHYPHEN_ID_RE.match(stem)):
-                nohyphen.append({
-                    "remote": remote,
-                    "path": f["path"],
-                    "size": f.get("size", 0),
-                    "size_human": _human_size(f.get("size", 0)),
-                    "mod_time": f.get("mod_time", ""),
-                    "jav_id": f.get("jav_id", ""),
-                    "canonical_name": _nohyphen_to_canonical(fname),
-                    "issue": "nohyphen_id",
-                })
-    return {"bracket_names": bracket, "nohyphen_names": nohyphen}
+                nohyphen.append(_cache_entry(
+                    remote, f, "nohyphen_id",
+                    canonical_name=_nohyphen_to_canonical(fname),
+                ))
+    missing_resolution = find_missing_resolution(cache, config)
+    resolution_noncanonical = find_resolution_noncanonical(cache, config)
+    return {
+        "bracket_names": bracket,
+        "nohyphen_names": nohyphen,
+        "missing_resolution": missing_resolution["items"],
+        "missing_resolution_summary": {
+            "count": missing_resolution["count"],
+            "by_extension": missing_resolution["by_extension"],
+            "by_remote": missing_resolution["by_remote"],
+        },
+        "resolution_noncanonical": resolution_noncanonical["items"],
+        "resolution_noncanonical_summary": {
+            "count": resolution_noncanonical["count"],
+            "by_kind": resolution_noncanonical["by_kind"],
+            "by_extension": resolution_noncanonical["by_extension"],
+        },
+    }


 def rename_file_in_remote(