From 1cc2c38128d3f91548b4d2d03966a46c8f72ed9d Mon Sep 17 00:00:00 2001 From: admin Date: Fri, 22 May 2026 22:01:52 +0200 Subject: [PATCH] Step 10i: rc-jav.py becomes a thin shim; main() lives in rcjav/cli.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The real entrypoint moved into rcjav/cli.py (845 lines: imports + the remaining top-level glue + collectors + main()). rc-jav.py is now a 25-line shim that does: - `from rcjav import *` to re-export the package surface for callers that load this script via importlib.spec_from_file_location (tests/test_rules.py, fixtures/run.py, the native-messaging host via importlib). - `from rcjav.cli import main` and call it under `__main__`. Verified all four entry points: - python rc-jav.py --help → ok (legacy CLI invocation) - python -m rcjav.cli --help → ok (package-direct) - python fixtures/run.py → 17/17 cases pass - python -m unittest tests.test_rules → 5/5 OK Co-Authored-By: Claude Opus 4.7 --- rc-jav.py | 846 +-------------------------------------------------- rcjav/cli.py | 845 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 858 insertions(+), 833 deletions(-) create mode 100644 rcjav/cli.py diff --git a/rc-jav.py b/rc-jav.py index d9a30ed..8add39d 100644 --- a/rc-jav.py +++ b/rc-jav.py @@ -1,845 +1,25 @@ #!/usr/bin/env python3 -"""Scan rclone remotes for duplicate JAV files grouped by ID.""" +"""Scan rclone remotes for duplicate JAV files grouped by ID. + +This file is a thin shim. All implementation lives in the `rcjav` +package; `rcjav.cli.main` is the real entrypoint. Names that +external consumers (tests, fixtures runner, native-messaging host) +historically imported from this script remain available via the +wildcard re-export from `rcjav/__init__.py`. +""" from __future__ import annotations -import argparse -import csv -import fnmatch -import json -import os -import re -import subprocess import sys -import threading -import time -import xml.etree.ElementTree as ET -from dataclasses import dataclass, asdict -from datetime import datetime -from pathlib import Path -from typing import Iterable -from rich.console import Console -from rich.panel import Panel -from rich.progress import ( - BarColumn, - MofNCompleteColumn, - Progress, - SpinnerColumn, - TextColumn, - TimeElapsedColumn, - TimeRemainingColumn, -) -from rich.table import Table -from rich.text import Text - -from rcjav.model import FileEntry -from rcjav import ids as _rcjav_ids -from rcjav.ids import ( - PRIMARY_ID_RE, - FALLBACK_ID_RE, - COMPOUND_ID_RE, - RANGE_RE, - BUILTIN_PART_RES, - configure_part_patterns, - detect_part, - detect_part_from_stem, - part_key, - extract_id, - normalize_id, - describe_id_match, - expand_range, - _VARIANT_SUFFIX_RE, - _RES_LABEL_RE, - _RESOLUTION_TAG_RE, - _BRACKET_ID_RE, - _NOHYPHEN_ID_RE, - _VIDEO_EXTS, - _LOWEST_KEEP_PRIORITY_EXTS, -) - - -# PART_RES is rebound by configure_part_patterns(); always read it dynamically -# from the rcjav.ids module rather than capturing a stale binding at import time. -def _current_part_res(): - return _rcjav_ids.PART_RES - - -from rcjav.rclone_io import ( - RCLONE_BIN, - DURATION_RE, - set_basic as _set_rclone_basic, - set_rclone_bin as _set_rclone_bin, - quick_search_remote, - choose_search_mode, - name_to_include_patterns, - name_match, - query_to_include_patterns, - remote_file_count, - parse_duration, - walk_remote, -) -from rcjav import output as _output -from rcjav.output import ( - human_size, - ansi, - ANSI_RESET, - ANSI_GREEN, - ANSI_RED, - ANSI_YELLOW, - ANSI_CYAN, - ANSI_DIM, - ANSI_BOLD, - strip_markup, - BasicProgress, - make_progress, - render_banner, - render_search, - render_name_matches, - render_name_matches_plain, - render_dupes, - render_banner_plain, - render_search_plain, - render_dupes_plain, - write_txt, - write_csv, - write_json, - describe_skipped_id, - dupes_to_obj, - set_use_ansi as _set_output_use_ansi, - set_basic as _set_output_basic, - set_console_no_color as _set_output_no_color, -) - -# rc-jav.py keeps its own local rich Console for the prints that haven't -# moved to rcjav.output yet (collectors, main()). When --no-color is in -# play we rebind both this and rcjav.output's console. -console = Console() - - -# Mirror of rcjav.rclone_io.BASIC for in-tree readers that haven't been -# updated yet (output renderers, BasicProgress checks in main()). Set in -# main() via both this name and _set_rclone_basic(). -BASIC = False # set by --basic -console = Console() # replaced in main() if --no-color - - -# Default remotes used when --search is invoked without explicit --source/--target. -DEFAULT_SOURCE = ["cq:personal-files/ClearJAV"] -DEFAULT_TARGET = ["cq:personal-files/JAV/TMP"] - -# Default WinCatalog export folder (or specific files). Folder entries auto-discover *.csv / *.xml. -DEFAULT_CATALOG: list[str] = [str(Path(__file__).resolve().parent / "wincatalog")] - -from rcjav.catalog import ( - CATALOG_COL_NAME, - CATALOG_COL_PATH, - CATALOG_COL_SIZE, - CATALOG_COL_DISC, - normalize_catalog_path, - load_catalog_csv, - load_catalog_xml, - load_catalogs, - _expand_catalog_paths, -) -from rcjav.cache import ( - CACHE_PATH, - CACHE_VERSION, - CACHE_STALE_HOURS, - load_cache, - save_cache, - cache_age_hours, - fmt_age, -) - -from rcjav.dupes import ( - DEFAULT_KEEP_RANKING, - set_keep_ranking, - decide_keep_with_reason, - decide_keep, - find_dupes, - describe_dupe_risks, - find_variant_alerts, -) -from rcjav.library import ( - find_library_issues, - rename_file_in_remote, - rename_files_batch, - _bracket_to_canonical, - _nohyphen_to_canonical, -) - -CONFIG_PATH = Path(__file__).resolve().parent / "config.json" - -def load_config() -> dict: - if not CONFIG_PATH.exists(): - return {} - try: - data = json.loads(CONFIG_PATH.read_text(encoding="utf-8")) - if not isinstance(data, dict): - return {} - return data - except (json.JSONDecodeError, OSError): - return {} - - -def save_config(cfg: dict) -> None: - tmp = CONFIG_PATH.with_suffix(CONFIG_PATH.suffix + ".tmp") - tmp.write_text(json.dumps(cfg, indent=2), encoding="utf-8") - os.replace(tmp, CONFIG_PATH) - - -# ---------- collectors ---------- - -def collect_with_progress(remotes_by_label: list[tuple[str, str]], - skipped: list[tuple[str, str]] - ) -> list[FileEntry]: - """Dupe-mode collect — every remote freshly walked with progress.""" - out: list[FileEntry] = [] - if not remotes_by_label: - return out - with make_progress() as progress: - tasks = {(label, r): progress.add_task(f"{label} {r}", total=1) - for label, r in remotes_by_label} - for (label, r), tid in tasks.items(): - entries, _ = walk_remote(r, label, skipped, progress, tid) - out.extend(entries) - return out - - -def cached_collect(remotes: list[str], source_label: str, - skipped: list[tuple[str, str]], - cache: dict, use_cache: bool, force_update: bool, - cache_meta: dict[str, dict], - scan_since: str | None = None) -> list[FileEntry]: - """Search-mode collect with cache. Always recursive. - scan_since: rclone duration string (`24h`, `7d`). When set during a forced - update, only files modified within the window are walked and merged on top - of the existing cache entry; files older than the window keep their cached - record. If there's no prior cache entry for a remote, falls through to a - full scan.""" - out: list[FileEntry] = [] - to_scan: list[str] = [] - to_incremental: list[tuple[str, dict]] = [] # (remote, existing_entry) - for r in remotes: - if scan_since and force_update and use_cache: - existing = cache["remotes"].get(r) - if existing: - to_incremental.append((r, existing)) - continue - # No prior cache for this remote -> can't be incremental, fall back. - entry = cache["remotes"].get(r) if use_cache and not force_update else None - if entry: - age = cache_age_hours(entry["scanned_at"]) - age_str = fmt_age(age) if age is not None else "?" - stale = age is not None and age > CACHE_STALE_HOURS - cache_meta[r] = {"cached": True, "age": age_str, "stale": stale, - "file_count": len(entry["files"])} - for f in entry["files"]: - out.append(FileEntry(source=source_label, remote=r, path=f["path"], - size=f["size"], mod_time=f.get("mod_time", ""), - jav_id=f["jav_id"])) - for s in entry.get("skipped", []): - skipped.append((r, s)) - else: - to_scan.append(r) - - if to_scan: - with make_progress() as progress: - tids = {r: progress.add_task(f"{source_label} {r}", total=1) for r in to_scan} - for r_idx, r in enumerate(to_scan): - _total: int | None = None - if BASIC: - # Emit SCAN_REMOTE_START immediately so the UI shows the remote name. - # Then probe the file count; once known, emit SCAN_REMOTE_COUNTED so - # the UI can show "N / total" without waiting for the first 100 files. - sys.stderr.write("SCAN_REMOTE_START " + json.dumps({ - "remote": r, "label": source_label, - "index": r_idx + 1, "of": len(to_scan), - "total": None, - }) + "\n") - sys.stderr.flush() - _total = remote_file_count(r) - sys.stderr.write("SCAN_REMOTE_COUNTED " + json.dumps({ - "remote": r, "total": _total, - }) + "\n") - sys.stderr.flush() - fresh, local_skipped = walk_remote(r, source_label, skipped, progress, tids[r], - _total_override=_total) - out.extend(fresh) - cache_meta[r] = {"cached": False, "age": "fresh", "stale": False, - "file_count": len(fresh)} - if use_cache: - cache["remotes"][r] = { - "scanned_at": datetime.now().astimezone().isoformat(), - "recursive": True, - "files": [{"path": e.path, "size": e.size, "mod_time": e.mod_time, - "jav_id": e.jav_id} for e in fresh], - "skipped": local_skipped, - } - if BASIC: - sys.stderr.write("SCAN_PROGRESS " + json.dumps({ - "remote": r, "label": source_label, - "files": len(fresh), "files_total": len(out), - }) + "\n") - sys.stderr.flush() - - if to_incremental: - with make_progress() as progress: - tids = {r: progress.add_task(f"{source_label} {r} (since {scan_since})", total=1) - for r, _ in to_incremental} - for r_idx, (r, existing) in enumerate(to_incremental): - if BASIC: - sys.stderr.write("SCAN_REMOTE_START " + json.dumps({ - "remote": r, "label": source_label, - "index": r_idx + 1, "of": len(to_incremental), - "total": None, "incremental": True, - }) + "\n") - sys.stderr.flush() - fresh, local_skipped = walk_remote( - r, source_label, skipped, progress, tids[r], max_age=scan_since, - ) - # Merge: replace entries at paths we just walked, keep all others. - new_paths = {e.path for e in fresh} - old_files = [f for f in existing.get("files", []) - if f["path"] not in new_paths] - merged_files = old_files + [ - {"path": e.path, "size": e.size, "mod_time": e.mod_time, - "jav_id": e.jav_id} for e in fresh - ] - # Merge skipped lists (de-dupe). - old_skipped = set(existing.get("skipped", [])) - old_skipped.update(local_skipped) - # Emit FileEntry for everything (old + new) so the caller sees the - # full set, not just deltas. - for f in merged_files: - out.append(FileEntry(source=source_label, remote=r, path=f["path"], - size=f["size"], mod_time=f.get("mod_time", ""), - jav_id=f["jav_id"])) - for s in old_skipped: - skipped.append((r, s)) - cache_meta[r] = { - "cached": False, "age": f"incremental {scan_since}", - "stale": False, "file_count": len(merged_files), - "added_or_updated": len(fresh), - } - if use_cache: - cache["remotes"][r] = { - "scanned_at": datetime.now().astimezone().isoformat(), - "recursive": True, - "files": merged_files, - "skipped": sorted(old_skipped), - } - if BASIC: - sys.stderr.write("SCAN_PROGRESS " + json.dumps({ - "remote": r, "label": source_label, - "files": len(fresh), "files_total": len(out), - "incremental": True, - "file_count": len(merged_files), - }) + "\n") - sys.stderr.flush() - return out - - -# ---------- main ---------- - -def main(): - ap = argparse.ArgumentParser(description="Report duplicate JAV files across rclone remotes (read-only).") - ap.add_argument("--source", "-s", action="append", default=[], metavar="REMOTE", - help="Source remote path (priority — wins dupes regardless of size). Repeatable.") - ap.add_argument("--target", "-t", action="append", default=[], metavar="REMOTE", - help="Target remote path (non-priority — largest size wins among targets). Repeatable.") - ap.add_argument("--format", choices=["console", "txt", "csv", "json", "all"], - default="console") - ap.add_argument("--output-dir", default="./reports", help="Where to write txt/csv/json.") - ap.add_argument("--no-color", action="store_true") - ap.add_argument("--rclone-bin", default="rclone", - help="Path to rclone executable (default: 'rclone' on PATH).") - ap.add_argument("--search", action="append", default=[], metavar="ID", - help="Search mode: look up a JAV ID (e.g. SSIS-001). Repeatable. " - "If no --source/--target given, default target is used.") - ap.add_argument("--name", action="append", default=[], metavar="STR", - help="Substring/glob search against filename. Case-insensitive. " - "Repeatable; OR semantics (any token match = hit). " - "Supports * and ? wildcards. Use quotes for spaces.") - ap.add_argument("--update", "-u", action="store_true", - help="Search mode: force re-scan and overwrite cache for requested remotes.") - ap.add_argument("--no-cache", action="store_true", - help="Search mode: bypass cache entirely (no read, no write).") - ap.add_argument("--quick", "-q", action="store_true", - help="Force quick mode: skip cache, query rclone directly with --include glob. " - "Default is auto: single exact IDs use quick, wildcards/ranges/multi use cached.") - ap.add_argument("--cache", action="store_true", - help="Force cached mode (opposite of --quick).") - ap.add_argument("--save", action="store_true", - help="Persist the --source / --target / --catalog values you passed " - "as new defaults in config.json next to the script. " - "Only keys you explicitly passed are saved.") - ap.add_argument("--scan", action="store_true", - help="Walk configured remotes, refresh cache, exit. No search/dupe output. " - "Default scope: DEFAULT_TARGET. Override with --source/--target. " - "Always overwrites cache. Suitable for Task Scheduler / cron.") - ap.add_argument("--scan-since", metavar="DURATION", - help="Incremental scan: only walk files modified within DURATION " - "(e.g. 24h, 7d, 30m, 90s). Merges new/changed entries on top of " - "the existing cache; old entries are preserved. Falls back to a " - "full scan if there's no prior cache for a remote. Requires --scan.") - ap.add_argument("--catalog", action="append", default=[], metavar="PATH", - help="Path to a WinCatalog CSV or XML export. Repeatable. " - "Listed under 'Catalog' in results (informational, never KEEP/DELETE?).") - ap.add_argument("--part-pattern", action="append", default=[], metavar="REGEX", - help="Extra multipart filename regex. Repeatable; first capture group must be the part number. " - "Patterns run against the filename stem after built-in part detectors.") - ap.add_argument("--library-issues", action="store_true", - help="Report non-canonical filenames (bracket-wrapped IDs, no-hyphen IDs). " - "Reads from cache. Outputs JSON when --format json, plain otherwise.") - ap.add_argument("--rename-file", action="store_true", - help="Rename one file in a remote and patch cache. " - "Requires --remote, --old-path, --new-path. Outputs JSON.") - ap.add_argument("--rename-files-batch", action="store_true", - help="Rename multiple files in one call, writing cache once. " - "Reads JSON array of {remote, old_path, new_path} from stdin. Outputs JSON.") - ap.add_argument("--remote", metavar="REMOTE", - help="Remote path root for --rename-file (e.g. cq:JAV).") - ap.add_argument("--old-path", metavar="PATH", - help="Relative path of the file to rename (within --remote).") - ap.add_argument("--new-path", metavar="PATH", - help="New relative path after rename (within --remote).") - ap.add_argument("--basic", action="store_true", - help="Plain text output, no rich tables/panels/progress bars. " - "Useful for piping or low-bandwidth terminals.") - ap.add_argument("--clearjav", action="store_true", - help="Shortcut: use DEFAULT_SOURCE as --source and DEFAULT_TARGET as --target, " - "Equivalent to " - "`--source cq:personal-files/ClearJAV --target cq:personal-files/JAV/TMP`.") - args = ap.parse_args() - - global console, BASIC, DEFAULT_SOURCE, DEFAULT_TARGET, DEFAULT_CATALOG - _set_rclone_bin(args.rclone_bin) - BASIC = args.basic or args.format == "json" - _set_output_basic(BASIC) - - # Apply persisted config overrides BEFORE defaults are consulted. - cfg = load_config() - if "default_source" in cfg: - DEFAULT_SOURCE = list(cfg["default_source"]) - if "default_target" in cfg: - DEFAULT_TARGET = list(cfg["default_target"]) - if "default_catalog" in cfg: - DEFAULT_CATALOG = list(cfg["default_catalog"]) - set_keep_ranking(cfg.get("keep_ranking") or {}) - part_patterns = list(cfg.get("part_patterns") or []) + list(args.part_pattern) - pattern_errors = configure_part_patterns(part_patterns) - if pattern_errors: - for err in pattern_errors: - console.print(f"[red]invalid part pattern:[/] {err}") - sys.exit(2) - - # --save: persist explicitly-passed values, exit. - if args.save: - if not (args.source or args.target or args.catalog or args.part_pattern): - console.print("[red]--save needs at least one --source/--target/--catalog/--part-pattern value to persist.[/]") - sys.exit(2) - new_cfg = dict(cfg) - if args.source: - new_cfg["default_source"] = list(args.source) - if args.target: - new_cfg["default_target"] = list(args.target) - if args.catalog: - new_cfg["default_catalog"] = list(args.catalog) - if args.part_pattern: - new_cfg["part_patterns"] = list(args.part_pattern) - save_config(new_cfg) - console.print(f"[green]Saved to {CONFIG_PATH}:[/]") - for k in ("default_source", "default_target", "default_catalog", "part_patterns"): - if k in new_cfg: - console.print(f" {k} = {new_cfg[k]}") - sys.exit(0) - _set_output_use_ansi(not args.no_color) - if args.no_color or BASIC: - console = Console(no_color=True, color_system=None, highlight=False) - _set_output_no_color() - - # Search mode: defaults kick in if no remotes specified. - if args.clearjav: - if not args.source: - args.source = list(DEFAULT_SOURCE) - if not args.target: - args.target = list(DEFAULT_TARGET) - - if args.search and not args.source and not args.target: - args.target = list(DEFAULT_TARGET) - - # --scan: default to DEFAULT_TARGET only, always overwrite cache. - if args.scan: - if not args.source and not args.target: - args.target = list(DEFAULT_TARGET) - args.update = True - - # Use default catalog(s) if user passed none. - if not args.catalog and DEFAULT_CATALOG: - args.catalog = list(DEFAULT_CATALOG) - - # --library-issues: read-only cache scan for non-canonical filenames. - if args.library_issues: - cache = load_cache() - issues = find_library_issues(cache) - if args.format == "json" or BASIC: - print(json.dumps({"ok": True, **issues})) - else: - bracket = issues["bracket_names"] - nohyphen = issues["nohyphen_names"] - total = len(bracket) + len(nohyphen) - if not total: - console.print(Panel("[bold green]No library issues found.[/]", title="Library Issues")) - else: - from rich.table import Table - t = Table(title=f"Library Issues ({total} file(s))", show_lines=True) - t.add_column("Issue", style="yellow", width=14) - t.add_column("Current Name") - t.add_column("Canonical Name", style="green") - t.add_column("Remote", style="dim") - for e in bracket: - t.add_row("bracket ID", Path(e["path"]).name, - e["canonical_name"], e["remote"]) - for e in nohyphen: - t.add_row("no hyphen", Path(e["path"]).name, - e["canonical_name"], e["remote"]) - console.print(t) - sys.exit(0) - - # --rename-files-batch: rename multiple files, single cache write. - if args.rename_files_batch: - try: - renames = json.loads(sys.stdin.read()) - except json.JSONDecodeError as e: - print(json.dumps({"ok": False, "error": f"Invalid JSON on stdin: {e}"})) - sys.exit(1) - if not isinstance(renames, list): - print(json.dumps({"ok": False, "error": "stdin must be a JSON array"})) - sys.exit(1) - cache = load_cache() - results = rename_files_batch(renames, cache, rclone_bin=RCLONE_BIN) - ok = any(r["ok"] for r in results) - print(json.dumps({"ok": ok, "results": results})) - sys.exit(0 if ok else 1) - - # --rename-file: rename one file in a remote and patch cache. - if args.rename_file: - if not args.remote or not args.old_path or not args.new_path: - ap.error("--rename-file requires --remote, --old-path, and --new-path.") - cache = load_cache() - result = rename_file_in_remote( - args.remote, args.old_path, args.new_path, cache, rclone_bin=RCLONE_BIN - ) - print(json.dumps(result)) - sys.exit(0 if result["ok"] else 1) - - if not args.source and not args.target and not args.catalog: - ap.error("Provide at least one --source, --target, or --catalog.") - - # Scan-only mode: walk remotes, write cache, summary, exit. - if args.scan: - scan_since = None - if args.scan_since: - scan_since = parse_duration(args.scan_since) - if not scan_since: - console.print(f"[red]invalid --scan-since value: {args.scan_since!r} " - f"(expected e.g. 24h, 7d, 30m, 90s)[/]") - sys.exit(2) - cache = load_cache() - cache_meta: dict[str, dict] = {} - skipped: list[tuple[str, str]] = [] - t0 = time.perf_counter() - if BASIC: - # `--scan` resolves its default target above. Report only the - # remotes that this scan will actually walk; falling back here to - # DEFAULT_SOURCE would resurrect retired source roots in job UI. - _all_remotes = list(args.source) + list(args.target) - sys.stderr.write("SCAN_START " + json.dumps({ - "remotes": _all_remotes, "total": len(_all_remotes), - }) + "\n") - sys.stderr.flush() - entries = (cached_collect(args.source, "Source", skipped, cache, - use_cache=not args.no_cache, force_update=True, - cache_meta=cache_meta, scan_since=scan_since) - + cached_collect(args.target, "Target", skipped, cache, - use_cache=not args.no_cache, force_update=True, - cache_meta=cache_meta, scan_since=scan_since)) - if not args.no_cache: - save_cache(cache) - elapsed = time.perf_counter() - t0 - if BASIC: - sys.stderr.write(f"Scan complete: {len(entries)} files in {elapsed:.2f}s\n") - sys.stderr.write(f"Cache: {CACHE_PATH}\n" if not args.no_cache - else "Cache: (skipped, --no-cache)\n") - else: - console.print(f"[bold green]Scan complete:[/] {len(entries)} files in {elapsed:.2f}s") - if not args.no_cache: - console.print(f"[dim]Cache: {CACHE_PATH}[/]") - else: - console.print("[dim]Cache: (skipped, --no-cache)[/]") - sys.exit(0) - - skipped: list[tuple[str, str]] = [] - t0 = time.perf_counter() - - if args.search or args.name: - search_timings: dict[str, int] = {} - # If --name was passed without explicit remotes, fall back to default target - # (catalog default already injected earlier; don't let it suppress remote defaulting). - if args.name and not args.search and not args.source and not args.target: - args.target = list(DEFAULT_TARGET) - # Substring name search can't be server-side filtered on most backends — cache wins. - # Only the ID search shape benefits from quick (server-side prefix glob). - if args.name and not args.quick: - mode, reason = "cached", "name substring search — cache is faster than rclone --include" - else: - combined = list(args.search) + list(args.name) - mode, reason = choose_search_mode(combined, args.quick, args.cache) - if BASIC: - sys.stderr.write(f"Mode: {mode} ({reason})\n") - else: - mode_color = "green" if mode == "quick" else "cyan" - console.print(f"[{mode_color}]Mode: {mode}[/] [dim]({reason})[/]") - - phase_t0 = time.perf_counter() - cache = load_cache() - search_timings["cache_load_ms"] = round((time.perf_counter() - phase_t0) * 1000) - use_cache = not args.no_cache and mode == "cached" - cache_meta: dict[str, dict] = {} - phase_t0 = time.perf_counter() - if mode == "quick": - all_patterns: list[str] = [] - for raw in args.search: - all_patterns.extend(query_to_include_patterns(raw)) - all_patterns.extend(name_to_include_patterns(args.name)) - entries = [] - for r in args.source: - cache_meta[r] = {"cached": False, "age": "quick", "stale": False, "file_count": 0} - got = quick_search_remote(r, "Source", all_patterns, skipped) - entries.extend(got) - cache_meta[r]["file_count"] = len(got) - for r in args.target: - cache_meta[r] = {"cached": False, "age": "quick", "stale": False, "file_count": 0} - got = quick_search_remote(r, "Target", all_patterns, skipped) - entries.extend(got) - cache_meta[r]["file_count"] = len(got) - else: - entries = (cached_collect(args.source, "Source", skipped, cache, - use_cache, args.update, cache_meta) - + cached_collect(args.target, "Target", skipped, cache, - use_cache, args.update, cache_meta)) - search_timings["entry_collect_ms"] = round((time.perf_counter() - phase_t0) * 1000) - # Load each catalog separately so cache_meta gets the per-catalog count - # (was global total — every catalog reported the sum across all). - catalog_entries: list[FileEntry] = [] - phase_t0 = time.perf_counter() - for cp_str in args.catalog: - for cp in _expand_catalog_paths([cp_str], default_paths=DEFAULT_CATALOG): - ext = cp.suffix.lower() - if ext == ".csv": - one = load_catalog_csv(cp, skipped) - elif ext == ".xml": - one = load_catalog_xml(cp, skipped) - else: - console.print(f"[yellow]WARN: unknown catalog format '{ext}' for {cp}; skipping.[/]") - continue - catalog_entries.extend(one) - cache_meta[f"catalog:{cp.name}"] = { - "cached": False, "age": "loaded", "stale": False, - "file_count": len(one), - } - entries.extend(catalog_entries) - search_timings["catalog_load_ms"] = round((time.perf_counter() - phase_t0) * 1000) - if use_cache and args.update: - save_cache(cache) - else: - if args.cache and not args.no_cache: - cache = load_cache() - cache_meta: dict[str, dict] = {} - entries = (cached_collect(args.source, "Source", skipped, cache, - use_cache=True, force_update=False, - cache_meta=cache_meta) - + cached_collect(args.target, "Target", skipped, cache, - use_cache=True, force_update=False, - cache_meta=cache_meta)) - else: - remotes_by_label = ([("Source", r) for r in args.source] - + [("Target", r) for r in args.target]) - entries = collect_with_progress(remotes_by_label, skipped) - entries.extend(load_catalogs(args.catalog, skipped, default_paths=DEFAULT_CATALOG)) - - elapsed = time.perf_counter() - t0 - if BASIC: - sys.stderr.write(f"Scanned/loaded {len(entries)} file(s) in {elapsed:.2f}s\n") - else: - console.print(f"[dim]Scanned/loaded {len(entries)} file(s) in {elapsed:.2f}s[/]") - - if args.search or args.name: - # query_expansions: original_raw -> list of normalized IDs / wildcard patterns to look up - query_expansions: dict[str, list[str]] = {} - queries: list[str] = [] - for raw in args.search: - if RANGE_RE.search(raw): - expanded = expand_range(raw) or [] - normed: list[str] = [] - for r in expanded: - n = normalize_id(r) - if n: - normed.append(n) - if not normed: - console.print(f"[yellow]WARN: range '{raw}' produced no valid IDs.[/]") - continue - queries.append(raw) - query_expansions[raw] = normed - continue - if "*" in raw or "?" in raw: - q = raw.upper() - queries.append(q) - query_expansions[q] = [q] - continue - norm = normalize_id(raw) - if not norm: - console.print(f"[yellow]WARN: cannot parse '{raw}' as a JAV ID, skipping.[/]") - continue - # Use the raw (upper-cased) form for display so leading zeros are preserved - # (e.g. user types PRTD-027 — keep it, don't show PRTD-27). Lookup still uses - # the normalized form internally. - display = raw.upper() - queries.append(display) - query_expansions[display] = [norm] - phase_t0 = time.perf_counter() - index: dict[str, list[FileEntry]] = {} - for e in entries: - index.setdefault(e.jav_id, []).append(e) - search_timings["index_ms"] = round((time.perf_counter() - phase_t0) * 1000) - phase_t0 = time.perf_counter() - matches: dict[str, list[FileEntry]] = {} - match_traces: dict[str, dict[int, dict[str, str]]] = {} - for q in queries: - expansions = query_expansions.get(q, [q]) - hits: list[FileEntry] = [] - seen: set[int] = set() - traces: dict[int, dict[str, str]] = {} - - def add_hit(entry: FileEntry, matched_query: str) -> None: - key = id(entry) - if key in seen: - return - seen.add(key) - hits.append(entry) - traces[key] = describe_id_match(q, matched_query, entry.jav_id, len(expansions)) - - for sub in expansions: - if "*" in sub or "?" in sub: - pat = sub if "#PART" in sub.upper() else sub + "*" - for k, v in index.items(): - if fnmatch.fnmatchcase(k, pat): - for e in v: - add_hit(e, sub) - elif "#part" in sub: - for e in index.get(sub, []): - add_hit(e, sub) - else: - for e in index.get(sub, []): - add_hit(e, sub) - for k, v in index.items(): - if k.startswith(sub + "#part"): - for e in v: - add_hit(e, sub) - matches[q] = hits - match_traces[q] = traces - search_timings["match_ms"] = round((time.perf_counter() - phase_t0) * 1000) - if args.format == "json": - # Structured output for tools that consume search results (e.g. the rclonex - # Brave extension). Includes everything needed to drive a UI: per-query hits - # with source/remote/path/size/mod_time, plus name-match block + skipped. - name_hits_json: list[FileEntry] = [] - if args.name: - for e in entries: - if name_match(Path(e.path).stem, args.name): - name_hits_json.append(e) - out_obj = { - "queries": [ - { - "query": q, - "hits": [ - {"source": e.source, "remote": e.remote, "path": e.path, - "full_path": e.full_path, "size": e.size, - "size_human": human_size(e.size), - "mod_time": e.mod_time, "jav_id": e.jav_id, - **match_traces.get(q, {}).get(id(e), {})} - for e in sorted(matches.get(q, []), key=lambda x: (x.jav_id, x.path.lower())) - ], - } - for q in queries - ], - "name_matches": [ - {"source": e.source, "remote": e.remote, "path": e.path, - "full_path": e.full_path, "size": e.size, - "size_human": human_size(e.size), "mod_time": e.mod_time, - "jav_id": e.jav_id, "match_kind": "name", - "match_reason": "Filename search", "match_confidence": "broad", - "matched_query": ", ".join(args.name), "matched_id": e.jav_id} - for e in sorted(name_hits_json, key=lambda x: (x.jav_id, x.path.lower())) - ], - "name_tokens": list(args.name), - "cache_meta": cache_meta, - "skipped_count": len(skipped), - "elapsed_sec": round(time.perf_counter() - t0, 3), - "timings": search_timings, - } - print(json.dumps(out_obj)) - id_ok = (not queries) or all(matches.values()) - name_ok = (not args.name) or bool(name_hits_json) - sys.exit(0 if (id_ok and name_ok) else 1) - if queries: - if BASIC: - print(render_search_plain(matches, queries, cache_meta)) - else: - render_search(matches, queries, cache_meta) - # --name results as a separate block - name_hits: list[FileEntry] = [] - if args.name: - for e in entries: - if name_match(Path(e.path).stem, args.name): - name_hits.append(e) - if BASIC: - print(render_name_matches_plain(name_hits, args.name, cache_meta)) - else: - render_name_matches(name_hits, args.name, cache_meta) - # Exit code: 0 if every search query had hits AND name-search (if used) returned hits. - id_ok = (not queries) or all(matches.values()) - name_ok = (not args.name) or bool(name_hits) - sys.exit(0 if (id_ok and name_ok) else 1) - - dupes = find_dupes(entries) - variant_alerts = find_variant_alerts(entries) - if args.format == "json" and BASIC: - print(json.dumps(dupes_to_obj(dupes, skipped, variant_alerts))) - sys.exit(0) - if BASIC: - print(render_dupes_plain(dupes, skipped, variant_alerts)) - else: - render_dupes(dupes, skipped, variant_alerts) - - if args.format != "console": - out_dir = Path(args.output_dir) - out_dir.mkdir(parents=True, exist_ok=True) - stamp = datetime.now().strftime("%Y%m%d-%H%M%S") - targets = {"txt", "csv", "json"} if args.format == "all" else {args.format} - if "txt" in targets: - write_txt(out_dir / f"dupes-{stamp}.txt", dupes, skipped) - if "csv" in targets: - write_csv(out_dir / f"dupes-{stamp}.csv", dupes) - if "json" in targets: - write_json(out_dir / f"dupes-{stamp}.json", dupes, skipped, variant_alerts) - console.print(f"[dim]Reports written to {out_dir}[/]") +# Re-export the public surface so `importlib.spec_from_file_location("rcjav_script", "rc-jav.py")` +# style loaders still find FileEntry, extract_id, decide_keep_with_reason, etc. via attribute access. +from rcjav import * # noqa: F401,F403 +from rcjav.cli import main if __name__ == "__main__": try: main() except KeyboardInterrupt: - console.print("\n[yellow]Aborted by user (Ctrl+C). Cache not written for in-flight scans.[/]") + sys.stderr.write("\nAborted by user (Ctrl+C). Cache not written for in-flight scans.\n") sys.exit(130) diff --git a/rcjav/cli.py b/rcjav/cli.py new file mode 100644 index 0000000..d9a30ed --- /dev/null +++ b/rcjav/cli.py @@ -0,0 +1,845 @@ +#!/usr/bin/env python3 +"""Scan rclone remotes for duplicate JAV files grouped by ID.""" +from __future__ import annotations + +import argparse +import csv +import fnmatch +import json +import os +import re +import subprocess +import sys +import threading +import time +import xml.etree.ElementTree as ET +from dataclasses import dataclass, asdict +from datetime import datetime +from pathlib import Path +from typing import Iterable + +from rich.console import Console +from rich.panel import Panel +from rich.progress import ( + BarColumn, + MofNCompleteColumn, + Progress, + SpinnerColumn, + TextColumn, + TimeElapsedColumn, + TimeRemainingColumn, +) +from rich.table import Table +from rich.text import Text + +from rcjav.model import FileEntry +from rcjav import ids as _rcjav_ids +from rcjav.ids import ( + PRIMARY_ID_RE, + FALLBACK_ID_RE, + COMPOUND_ID_RE, + RANGE_RE, + BUILTIN_PART_RES, + configure_part_patterns, + detect_part, + detect_part_from_stem, + part_key, + extract_id, + normalize_id, + describe_id_match, + expand_range, + _VARIANT_SUFFIX_RE, + _RES_LABEL_RE, + _RESOLUTION_TAG_RE, + _BRACKET_ID_RE, + _NOHYPHEN_ID_RE, + _VIDEO_EXTS, + _LOWEST_KEEP_PRIORITY_EXTS, +) + + +# PART_RES is rebound by configure_part_patterns(); always read it dynamically +# from the rcjav.ids module rather than capturing a stale binding at import time. +def _current_part_res(): + return _rcjav_ids.PART_RES + + +from rcjav.rclone_io import ( + RCLONE_BIN, + DURATION_RE, + set_basic as _set_rclone_basic, + set_rclone_bin as _set_rclone_bin, + quick_search_remote, + choose_search_mode, + name_to_include_patterns, + name_match, + query_to_include_patterns, + remote_file_count, + parse_duration, + walk_remote, +) +from rcjav import output as _output +from rcjav.output import ( + human_size, + ansi, + ANSI_RESET, + ANSI_GREEN, + ANSI_RED, + ANSI_YELLOW, + ANSI_CYAN, + ANSI_DIM, + ANSI_BOLD, + strip_markup, + BasicProgress, + make_progress, + render_banner, + render_search, + render_name_matches, + render_name_matches_plain, + render_dupes, + render_banner_plain, + render_search_plain, + render_dupes_plain, + write_txt, + write_csv, + write_json, + describe_skipped_id, + dupes_to_obj, + set_use_ansi as _set_output_use_ansi, + set_basic as _set_output_basic, + set_console_no_color as _set_output_no_color, +) + +# rc-jav.py keeps its own local rich Console for the prints that haven't +# moved to rcjav.output yet (collectors, main()). When --no-color is in +# play we rebind both this and rcjav.output's console. +console = Console() + + +# Mirror of rcjav.rclone_io.BASIC for in-tree readers that haven't been +# updated yet (output renderers, BasicProgress checks in main()). Set in +# main() via both this name and _set_rclone_basic(). +BASIC = False # set by --basic +console = Console() # replaced in main() if --no-color + + +# Default remotes used when --search is invoked without explicit --source/--target. +DEFAULT_SOURCE = ["cq:personal-files/ClearJAV"] +DEFAULT_TARGET = ["cq:personal-files/JAV/TMP"] + +# Default WinCatalog export folder (or specific files). Folder entries auto-discover *.csv / *.xml. +DEFAULT_CATALOG: list[str] = [str(Path(__file__).resolve().parent / "wincatalog")] + +from rcjav.catalog import ( + CATALOG_COL_NAME, + CATALOG_COL_PATH, + CATALOG_COL_SIZE, + CATALOG_COL_DISC, + normalize_catalog_path, + load_catalog_csv, + load_catalog_xml, + load_catalogs, + _expand_catalog_paths, +) +from rcjav.cache import ( + CACHE_PATH, + CACHE_VERSION, + CACHE_STALE_HOURS, + load_cache, + save_cache, + cache_age_hours, + fmt_age, +) + +from rcjav.dupes import ( + DEFAULT_KEEP_RANKING, + set_keep_ranking, + decide_keep_with_reason, + decide_keep, + find_dupes, + describe_dupe_risks, + find_variant_alerts, +) +from rcjav.library import ( + find_library_issues, + rename_file_in_remote, + rename_files_batch, + _bracket_to_canonical, + _nohyphen_to_canonical, +) + +CONFIG_PATH = Path(__file__).resolve().parent / "config.json" + +def load_config() -> dict: + if not CONFIG_PATH.exists(): + return {} + try: + data = json.loads(CONFIG_PATH.read_text(encoding="utf-8")) + if not isinstance(data, dict): + return {} + return data + except (json.JSONDecodeError, OSError): + return {} + + +def save_config(cfg: dict) -> None: + tmp = CONFIG_PATH.with_suffix(CONFIG_PATH.suffix + ".tmp") + tmp.write_text(json.dumps(cfg, indent=2), encoding="utf-8") + os.replace(tmp, CONFIG_PATH) + + +# ---------- collectors ---------- + +def collect_with_progress(remotes_by_label: list[tuple[str, str]], + skipped: list[tuple[str, str]] + ) -> list[FileEntry]: + """Dupe-mode collect — every remote freshly walked with progress.""" + out: list[FileEntry] = [] + if not remotes_by_label: + return out + with make_progress() as progress: + tasks = {(label, r): progress.add_task(f"{label} {r}", total=1) + for label, r in remotes_by_label} + for (label, r), tid in tasks.items(): + entries, _ = walk_remote(r, label, skipped, progress, tid) + out.extend(entries) + return out + + +def cached_collect(remotes: list[str], source_label: str, + skipped: list[tuple[str, str]], + cache: dict, use_cache: bool, force_update: bool, + cache_meta: dict[str, dict], + scan_since: str | None = None) -> list[FileEntry]: + """Search-mode collect with cache. Always recursive. + scan_since: rclone duration string (`24h`, `7d`). When set during a forced + update, only files modified within the window are walked and merged on top + of the existing cache entry; files older than the window keep their cached + record. If there's no prior cache entry for a remote, falls through to a + full scan.""" + out: list[FileEntry] = [] + to_scan: list[str] = [] + to_incremental: list[tuple[str, dict]] = [] # (remote, existing_entry) + for r in remotes: + if scan_since and force_update and use_cache: + existing = cache["remotes"].get(r) + if existing: + to_incremental.append((r, existing)) + continue + # No prior cache for this remote -> can't be incremental, fall back. + entry = cache["remotes"].get(r) if use_cache and not force_update else None + if entry: + age = cache_age_hours(entry["scanned_at"]) + age_str = fmt_age(age) if age is not None else "?" + stale = age is not None and age > CACHE_STALE_HOURS + cache_meta[r] = {"cached": True, "age": age_str, "stale": stale, + "file_count": len(entry["files"])} + for f in entry["files"]: + out.append(FileEntry(source=source_label, remote=r, path=f["path"], + size=f["size"], mod_time=f.get("mod_time", ""), + jav_id=f["jav_id"])) + for s in entry.get("skipped", []): + skipped.append((r, s)) + else: + to_scan.append(r) + + if to_scan: + with make_progress() as progress: + tids = {r: progress.add_task(f"{source_label} {r}", total=1) for r in to_scan} + for r_idx, r in enumerate(to_scan): + _total: int | None = None + if BASIC: + # Emit SCAN_REMOTE_START immediately so the UI shows the remote name. + # Then probe the file count; once known, emit SCAN_REMOTE_COUNTED so + # the UI can show "N / total" without waiting for the first 100 files. + sys.stderr.write("SCAN_REMOTE_START " + json.dumps({ + "remote": r, "label": source_label, + "index": r_idx + 1, "of": len(to_scan), + "total": None, + }) + "\n") + sys.stderr.flush() + _total = remote_file_count(r) + sys.stderr.write("SCAN_REMOTE_COUNTED " + json.dumps({ + "remote": r, "total": _total, + }) + "\n") + sys.stderr.flush() + fresh, local_skipped = walk_remote(r, source_label, skipped, progress, tids[r], + _total_override=_total) + out.extend(fresh) + cache_meta[r] = {"cached": False, "age": "fresh", "stale": False, + "file_count": len(fresh)} + if use_cache: + cache["remotes"][r] = { + "scanned_at": datetime.now().astimezone().isoformat(), + "recursive": True, + "files": [{"path": e.path, "size": e.size, "mod_time": e.mod_time, + "jav_id": e.jav_id} for e in fresh], + "skipped": local_skipped, + } + if BASIC: + sys.stderr.write("SCAN_PROGRESS " + json.dumps({ + "remote": r, "label": source_label, + "files": len(fresh), "files_total": len(out), + }) + "\n") + sys.stderr.flush() + + if to_incremental: + with make_progress() as progress: + tids = {r: progress.add_task(f"{source_label} {r} (since {scan_since})", total=1) + for r, _ in to_incremental} + for r_idx, (r, existing) in enumerate(to_incremental): + if BASIC: + sys.stderr.write("SCAN_REMOTE_START " + json.dumps({ + "remote": r, "label": source_label, + "index": r_idx + 1, "of": len(to_incremental), + "total": None, "incremental": True, + }) + "\n") + sys.stderr.flush() + fresh, local_skipped = walk_remote( + r, source_label, skipped, progress, tids[r], max_age=scan_since, + ) + # Merge: replace entries at paths we just walked, keep all others. + new_paths = {e.path for e in fresh} + old_files = [f for f in existing.get("files", []) + if f["path"] not in new_paths] + merged_files = old_files + [ + {"path": e.path, "size": e.size, "mod_time": e.mod_time, + "jav_id": e.jav_id} for e in fresh + ] + # Merge skipped lists (de-dupe). + old_skipped = set(existing.get("skipped", [])) + old_skipped.update(local_skipped) + # Emit FileEntry for everything (old + new) so the caller sees the + # full set, not just deltas. + for f in merged_files: + out.append(FileEntry(source=source_label, remote=r, path=f["path"], + size=f["size"], mod_time=f.get("mod_time", ""), + jav_id=f["jav_id"])) + for s in old_skipped: + skipped.append((r, s)) + cache_meta[r] = { + "cached": False, "age": f"incremental {scan_since}", + "stale": False, "file_count": len(merged_files), + "added_or_updated": len(fresh), + } + if use_cache: + cache["remotes"][r] = { + "scanned_at": datetime.now().astimezone().isoformat(), + "recursive": True, + "files": merged_files, + "skipped": sorted(old_skipped), + } + if BASIC: + sys.stderr.write("SCAN_PROGRESS " + json.dumps({ + "remote": r, "label": source_label, + "files": len(fresh), "files_total": len(out), + "incremental": True, + "file_count": len(merged_files), + }) + "\n") + sys.stderr.flush() + return out + + +# ---------- main ---------- + +def main(): + ap = argparse.ArgumentParser(description="Report duplicate JAV files across rclone remotes (read-only).") + ap.add_argument("--source", "-s", action="append", default=[], metavar="REMOTE", + help="Source remote path (priority — wins dupes regardless of size). Repeatable.") + ap.add_argument("--target", "-t", action="append", default=[], metavar="REMOTE", + help="Target remote path (non-priority — largest size wins among targets). Repeatable.") + ap.add_argument("--format", choices=["console", "txt", "csv", "json", "all"], + default="console") + ap.add_argument("--output-dir", default="./reports", help="Where to write txt/csv/json.") + ap.add_argument("--no-color", action="store_true") + ap.add_argument("--rclone-bin", default="rclone", + help="Path to rclone executable (default: 'rclone' on PATH).") + ap.add_argument("--search", action="append", default=[], metavar="ID", + help="Search mode: look up a JAV ID (e.g. SSIS-001). Repeatable. " + "If no --source/--target given, default target is used.") + ap.add_argument("--name", action="append", default=[], metavar="STR", + help="Substring/glob search against filename. Case-insensitive. " + "Repeatable; OR semantics (any token match = hit). " + "Supports * and ? wildcards. Use quotes for spaces.") + ap.add_argument("--update", "-u", action="store_true", + help="Search mode: force re-scan and overwrite cache for requested remotes.") + ap.add_argument("--no-cache", action="store_true", + help="Search mode: bypass cache entirely (no read, no write).") + ap.add_argument("--quick", "-q", action="store_true", + help="Force quick mode: skip cache, query rclone directly with --include glob. " + "Default is auto: single exact IDs use quick, wildcards/ranges/multi use cached.") + ap.add_argument("--cache", action="store_true", + help="Force cached mode (opposite of --quick).") + ap.add_argument("--save", action="store_true", + help="Persist the --source / --target / --catalog values you passed " + "as new defaults in config.json next to the script. " + "Only keys you explicitly passed are saved.") + ap.add_argument("--scan", action="store_true", + help="Walk configured remotes, refresh cache, exit. No search/dupe output. " + "Default scope: DEFAULT_TARGET. Override with --source/--target. " + "Always overwrites cache. Suitable for Task Scheduler / cron.") + ap.add_argument("--scan-since", metavar="DURATION", + help="Incremental scan: only walk files modified within DURATION " + "(e.g. 24h, 7d, 30m, 90s). Merges new/changed entries on top of " + "the existing cache; old entries are preserved. Falls back to a " + "full scan if there's no prior cache for a remote. Requires --scan.") + ap.add_argument("--catalog", action="append", default=[], metavar="PATH", + help="Path to a WinCatalog CSV or XML export. Repeatable. " + "Listed under 'Catalog' in results (informational, never KEEP/DELETE?).") + ap.add_argument("--part-pattern", action="append", default=[], metavar="REGEX", + help="Extra multipart filename regex. Repeatable; first capture group must be the part number. " + "Patterns run against the filename stem after built-in part detectors.") + ap.add_argument("--library-issues", action="store_true", + help="Report non-canonical filenames (bracket-wrapped IDs, no-hyphen IDs). " + "Reads from cache. Outputs JSON when --format json, plain otherwise.") + ap.add_argument("--rename-file", action="store_true", + help="Rename one file in a remote and patch cache. " + "Requires --remote, --old-path, --new-path. Outputs JSON.") + ap.add_argument("--rename-files-batch", action="store_true", + help="Rename multiple files in one call, writing cache once. " + "Reads JSON array of {remote, old_path, new_path} from stdin. Outputs JSON.") + ap.add_argument("--remote", metavar="REMOTE", + help="Remote path root for --rename-file (e.g. cq:JAV).") + ap.add_argument("--old-path", metavar="PATH", + help="Relative path of the file to rename (within --remote).") + ap.add_argument("--new-path", metavar="PATH", + help="New relative path after rename (within --remote).") + ap.add_argument("--basic", action="store_true", + help="Plain text output, no rich tables/panels/progress bars. " + "Useful for piping or low-bandwidth terminals.") + ap.add_argument("--clearjav", action="store_true", + help="Shortcut: use DEFAULT_SOURCE as --source and DEFAULT_TARGET as --target, " + "Equivalent to " + "`--source cq:personal-files/ClearJAV --target cq:personal-files/JAV/TMP`.") + args = ap.parse_args() + + global console, BASIC, DEFAULT_SOURCE, DEFAULT_TARGET, DEFAULT_CATALOG + _set_rclone_bin(args.rclone_bin) + BASIC = args.basic or args.format == "json" + _set_output_basic(BASIC) + + # Apply persisted config overrides BEFORE defaults are consulted. + cfg = load_config() + if "default_source" in cfg: + DEFAULT_SOURCE = list(cfg["default_source"]) + if "default_target" in cfg: + DEFAULT_TARGET = list(cfg["default_target"]) + if "default_catalog" in cfg: + DEFAULT_CATALOG = list(cfg["default_catalog"]) + set_keep_ranking(cfg.get("keep_ranking") or {}) + part_patterns = list(cfg.get("part_patterns") or []) + list(args.part_pattern) + pattern_errors = configure_part_patterns(part_patterns) + if pattern_errors: + for err in pattern_errors: + console.print(f"[red]invalid part pattern:[/] {err}") + sys.exit(2) + + # --save: persist explicitly-passed values, exit. + if args.save: + if not (args.source or args.target or args.catalog or args.part_pattern): + console.print("[red]--save needs at least one --source/--target/--catalog/--part-pattern value to persist.[/]") + sys.exit(2) + new_cfg = dict(cfg) + if args.source: + new_cfg["default_source"] = list(args.source) + if args.target: + new_cfg["default_target"] = list(args.target) + if args.catalog: + new_cfg["default_catalog"] = list(args.catalog) + if args.part_pattern: + new_cfg["part_patterns"] = list(args.part_pattern) + save_config(new_cfg) + console.print(f"[green]Saved to {CONFIG_PATH}:[/]") + for k in ("default_source", "default_target", "default_catalog", "part_patterns"): + if k in new_cfg: + console.print(f" {k} = {new_cfg[k]}") + sys.exit(0) + _set_output_use_ansi(not args.no_color) + if args.no_color or BASIC: + console = Console(no_color=True, color_system=None, highlight=False) + _set_output_no_color() + + # Search mode: defaults kick in if no remotes specified. + if args.clearjav: + if not args.source: + args.source = list(DEFAULT_SOURCE) + if not args.target: + args.target = list(DEFAULT_TARGET) + + if args.search and not args.source and not args.target: + args.target = list(DEFAULT_TARGET) + + # --scan: default to DEFAULT_TARGET only, always overwrite cache. + if args.scan: + if not args.source and not args.target: + args.target = list(DEFAULT_TARGET) + args.update = True + + # Use default catalog(s) if user passed none. + if not args.catalog and DEFAULT_CATALOG: + args.catalog = list(DEFAULT_CATALOG) + + # --library-issues: read-only cache scan for non-canonical filenames. + if args.library_issues: + cache = load_cache() + issues = find_library_issues(cache) + if args.format == "json" or BASIC: + print(json.dumps({"ok": True, **issues})) + else: + bracket = issues["bracket_names"] + nohyphen = issues["nohyphen_names"] + total = len(bracket) + len(nohyphen) + if not total: + console.print(Panel("[bold green]No library issues found.[/]", title="Library Issues")) + else: + from rich.table import Table + t = Table(title=f"Library Issues ({total} file(s))", show_lines=True) + t.add_column("Issue", style="yellow", width=14) + t.add_column("Current Name") + t.add_column("Canonical Name", style="green") + t.add_column("Remote", style="dim") + for e in bracket: + t.add_row("bracket ID", Path(e["path"]).name, + e["canonical_name"], e["remote"]) + for e in nohyphen: + t.add_row("no hyphen", Path(e["path"]).name, + e["canonical_name"], e["remote"]) + console.print(t) + sys.exit(0) + + # --rename-files-batch: rename multiple files, single cache write. + if args.rename_files_batch: + try: + renames = json.loads(sys.stdin.read()) + except json.JSONDecodeError as e: + print(json.dumps({"ok": False, "error": f"Invalid JSON on stdin: {e}"})) + sys.exit(1) + if not isinstance(renames, list): + print(json.dumps({"ok": False, "error": "stdin must be a JSON array"})) + sys.exit(1) + cache = load_cache() + results = rename_files_batch(renames, cache, rclone_bin=RCLONE_BIN) + ok = any(r["ok"] for r in results) + print(json.dumps({"ok": ok, "results": results})) + sys.exit(0 if ok else 1) + + # --rename-file: rename one file in a remote and patch cache. + if args.rename_file: + if not args.remote or not args.old_path or not args.new_path: + ap.error("--rename-file requires --remote, --old-path, and --new-path.") + cache = load_cache() + result = rename_file_in_remote( + args.remote, args.old_path, args.new_path, cache, rclone_bin=RCLONE_BIN + ) + print(json.dumps(result)) + sys.exit(0 if result["ok"] else 1) + + if not args.source and not args.target and not args.catalog: + ap.error("Provide at least one --source, --target, or --catalog.") + + # Scan-only mode: walk remotes, write cache, summary, exit. + if args.scan: + scan_since = None + if args.scan_since: + scan_since = parse_duration(args.scan_since) + if not scan_since: + console.print(f"[red]invalid --scan-since value: {args.scan_since!r} " + f"(expected e.g. 24h, 7d, 30m, 90s)[/]") + sys.exit(2) + cache = load_cache() + cache_meta: dict[str, dict] = {} + skipped: list[tuple[str, str]] = [] + t0 = time.perf_counter() + if BASIC: + # `--scan` resolves its default target above. Report only the + # remotes that this scan will actually walk; falling back here to + # DEFAULT_SOURCE would resurrect retired source roots in job UI. + _all_remotes = list(args.source) + list(args.target) + sys.stderr.write("SCAN_START " + json.dumps({ + "remotes": _all_remotes, "total": len(_all_remotes), + }) + "\n") + sys.stderr.flush() + entries = (cached_collect(args.source, "Source", skipped, cache, + use_cache=not args.no_cache, force_update=True, + cache_meta=cache_meta, scan_since=scan_since) + + cached_collect(args.target, "Target", skipped, cache, + use_cache=not args.no_cache, force_update=True, + cache_meta=cache_meta, scan_since=scan_since)) + if not args.no_cache: + save_cache(cache) + elapsed = time.perf_counter() - t0 + if BASIC: + sys.stderr.write(f"Scan complete: {len(entries)} files in {elapsed:.2f}s\n") + sys.stderr.write(f"Cache: {CACHE_PATH}\n" if not args.no_cache + else "Cache: (skipped, --no-cache)\n") + else: + console.print(f"[bold green]Scan complete:[/] {len(entries)} files in {elapsed:.2f}s") + if not args.no_cache: + console.print(f"[dim]Cache: {CACHE_PATH}[/]") + else: + console.print("[dim]Cache: (skipped, --no-cache)[/]") + sys.exit(0) + + skipped: list[tuple[str, str]] = [] + t0 = time.perf_counter() + + if args.search or args.name: + search_timings: dict[str, int] = {} + # If --name was passed without explicit remotes, fall back to default target + # (catalog default already injected earlier; don't let it suppress remote defaulting). + if args.name and not args.search and not args.source and not args.target: + args.target = list(DEFAULT_TARGET) + # Substring name search can't be server-side filtered on most backends — cache wins. + # Only the ID search shape benefits from quick (server-side prefix glob). + if args.name and not args.quick: + mode, reason = "cached", "name substring search — cache is faster than rclone --include" + else: + combined = list(args.search) + list(args.name) + mode, reason = choose_search_mode(combined, args.quick, args.cache) + if BASIC: + sys.stderr.write(f"Mode: {mode} ({reason})\n") + else: + mode_color = "green" if mode == "quick" else "cyan" + console.print(f"[{mode_color}]Mode: {mode}[/] [dim]({reason})[/]") + + phase_t0 = time.perf_counter() + cache = load_cache() + search_timings["cache_load_ms"] = round((time.perf_counter() - phase_t0) * 1000) + use_cache = not args.no_cache and mode == "cached" + cache_meta: dict[str, dict] = {} + phase_t0 = time.perf_counter() + if mode == "quick": + all_patterns: list[str] = [] + for raw in args.search: + all_patterns.extend(query_to_include_patterns(raw)) + all_patterns.extend(name_to_include_patterns(args.name)) + entries = [] + for r in args.source: + cache_meta[r] = {"cached": False, "age": "quick", "stale": False, "file_count": 0} + got = quick_search_remote(r, "Source", all_patterns, skipped) + entries.extend(got) + cache_meta[r]["file_count"] = len(got) + for r in args.target: + cache_meta[r] = {"cached": False, "age": "quick", "stale": False, "file_count": 0} + got = quick_search_remote(r, "Target", all_patterns, skipped) + entries.extend(got) + cache_meta[r]["file_count"] = len(got) + else: + entries = (cached_collect(args.source, "Source", skipped, cache, + use_cache, args.update, cache_meta) + + cached_collect(args.target, "Target", skipped, cache, + use_cache, args.update, cache_meta)) + search_timings["entry_collect_ms"] = round((time.perf_counter() - phase_t0) * 1000) + # Load each catalog separately so cache_meta gets the per-catalog count + # (was global total — every catalog reported the sum across all). + catalog_entries: list[FileEntry] = [] + phase_t0 = time.perf_counter() + for cp_str in args.catalog: + for cp in _expand_catalog_paths([cp_str], default_paths=DEFAULT_CATALOG): + ext = cp.suffix.lower() + if ext == ".csv": + one = load_catalog_csv(cp, skipped) + elif ext == ".xml": + one = load_catalog_xml(cp, skipped) + else: + console.print(f"[yellow]WARN: unknown catalog format '{ext}' for {cp}; skipping.[/]") + continue + catalog_entries.extend(one) + cache_meta[f"catalog:{cp.name}"] = { + "cached": False, "age": "loaded", "stale": False, + "file_count": len(one), + } + entries.extend(catalog_entries) + search_timings["catalog_load_ms"] = round((time.perf_counter() - phase_t0) * 1000) + if use_cache and args.update: + save_cache(cache) + else: + if args.cache and not args.no_cache: + cache = load_cache() + cache_meta: dict[str, dict] = {} + entries = (cached_collect(args.source, "Source", skipped, cache, + use_cache=True, force_update=False, + cache_meta=cache_meta) + + cached_collect(args.target, "Target", skipped, cache, + use_cache=True, force_update=False, + cache_meta=cache_meta)) + else: + remotes_by_label = ([("Source", r) for r in args.source] + + [("Target", r) for r in args.target]) + entries = collect_with_progress(remotes_by_label, skipped) + entries.extend(load_catalogs(args.catalog, skipped, default_paths=DEFAULT_CATALOG)) + + elapsed = time.perf_counter() - t0 + if BASIC: + sys.stderr.write(f"Scanned/loaded {len(entries)} file(s) in {elapsed:.2f}s\n") + else: + console.print(f"[dim]Scanned/loaded {len(entries)} file(s) in {elapsed:.2f}s[/]") + + if args.search or args.name: + # query_expansions: original_raw -> list of normalized IDs / wildcard patterns to look up + query_expansions: dict[str, list[str]] = {} + queries: list[str] = [] + for raw in args.search: + if RANGE_RE.search(raw): + expanded = expand_range(raw) or [] + normed: list[str] = [] + for r in expanded: + n = normalize_id(r) + if n: + normed.append(n) + if not normed: + console.print(f"[yellow]WARN: range '{raw}' produced no valid IDs.[/]") + continue + queries.append(raw) + query_expansions[raw] = normed + continue + if "*" in raw or "?" in raw: + q = raw.upper() + queries.append(q) + query_expansions[q] = [q] + continue + norm = normalize_id(raw) + if not norm: + console.print(f"[yellow]WARN: cannot parse '{raw}' as a JAV ID, skipping.[/]") + continue + # Use the raw (upper-cased) form for display so leading zeros are preserved + # (e.g. user types PRTD-027 — keep it, don't show PRTD-27). Lookup still uses + # the normalized form internally. + display = raw.upper() + queries.append(display) + query_expansions[display] = [norm] + phase_t0 = time.perf_counter() + index: dict[str, list[FileEntry]] = {} + for e in entries: + index.setdefault(e.jav_id, []).append(e) + search_timings["index_ms"] = round((time.perf_counter() - phase_t0) * 1000) + phase_t0 = time.perf_counter() + matches: dict[str, list[FileEntry]] = {} + match_traces: dict[str, dict[int, dict[str, str]]] = {} + for q in queries: + expansions = query_expansions.get(q, [q]) + hits: list[FileEntry] = [] + seen: set[int] = set() + traces: dict[int, dict[str, str]] = {} + + def add_hit(entry: FileEntry, matched_query: str) -> None: + key = id(entry) + if key in seen: + return + seen.add(key) + hits.append(entry) + traces[key] = describe_id_match(q, matched_query, entry.jav_id, len(expansions)) + + for sub in expansions: + if "*" in sub or "?" in sub: + pat = sub if "#PART" in sub.upper() else sub + "*" + for k, v in index.items(): + if fnmatch.fnmatchcase(k, pat): + for e in v: + add_hit(e, sub) + elif "#part" in sub: + for e in index.get(sub, []): + add_hit(e, sub) + else: + for e in index.get(sub, []): + add_hit(e, sub) + for k, v in index.items(): + if k.startswith(sub + "#part"): + for e in v: + add_hit(e, sub) + matches[q] = hits + match_traces[q] = traces + search_timings["match_ms"] = round((time.perf_counter() - phase_t0) * 1000) + if args.format == "json": + # Structured output for tools that consume search results (e.g. the rclonex + # Brave extension). Includes everything needed to drive a UI: per-query hits + # with source/remote/path/size/mod_time, plus name-match block + skipped. + name_hits_json: list[FileEntry] = [] + if args.name: + for e in entries: + if name_match(Path(e.path).stem, args.name): + name_hits_json.append(e) + out_obj = { + "queries": [ + { + "query": q, + "hits": [ + {"source": e.source, "remote": e.remote, "path": e.path, + "full_path": e.full_path, "size": e.size, + "size_human": human_size(e.size), + "mod_time": e.mod_time, "jav_id": e.jav_id, + **match_traces.get(q, {}).get(id(e), {})} + for e in sorted(matches.get(q, []), key=lambda x: (x.jav_id, x.path.lower())) + ], + } + for q in queries + ], + "name_matches": [ + {"source": e.source, "remote": e.remote, "path": e.path, + "full_path": e.full_path, "size": e.size, + "size_human": human_size(e.size), "mod_time": e.mod_time, + "jav_id": e.jav_id, "match_kind": "name", + "match_reason": "Filename search", "match_confidence": "broad", + "matched_query": ", ".join(args.name), "matched_id": e.jav_id} + for e in sorted(name_hits_json, key=lambda x: (x.jav_id, x.path.lower())) + ], + "name_tokens": list(args.name), + "cache_meta": cache_meta, + "skipped_count": len(skipped), + "elapsed_sec": round(time.perf_counter() - t0, 3), + "timings": search_timings, + } + print(json.dumps(out_obj)) + id_ok = (not queries) or all(matches.values()) + name_ok = (not args.name) or bool(name_hits_json) + sys.exit(0 if (id_ok and name_ok) else 1) + if queries: + if BASIC: + print(render_search_plain(matches, queries, cache_meta)) + else: + render_search(matches, queries, cache_meta) + # --name results as a separate block + name_hits: list[FileEntry] = [] + if args.name: + for e in entries: + if name_match(Path(e.path).stem, args.name): + name_hits.append(e) + if BASIC: + print(render_name_matches_plain(name_hits, args.name, cache_meta)) + else: + render_name_matches(name_hits, args.name, cache_meta) + # Exit code: 0 if every search query had hits AND name-search (if used) returned hits. + id_ok = (not queries) or all(matches.values()) + name_ok = (not args.name) or bool(name_hits) + sys.exit(0 if (id_ok and name_ok) else 1) + + dupes = find_dupes(entries) + variant_alerts = find_variant_alerts(entries) + if args.format == "json" and BASIC: + print(json.dumps(dupes_to_obj(dupes, skipped, variant_alerts))) + sys.exit(0) + if BASIC: + print(render_dupes_plain(dupes, skipped, variant_alerts)) + else: + render_dupes(dupes, skipped, variant_alerts) + + if args.format != "console": + out_dir = Path(args.output_dir) + out_dir.mkdir(parents=True, exist_ok=True) + stamp = datetime.now().strftime("%Y%m%d-%H%M%S") + targets = {"txt", "csv", "json"} if args.format == "all" else {args.format} + if "txt" in targets: + write_txt(out_dir / f"dupes-{stamp}.txt", dupes, skipped) + if "csv" in targets: + write_csv(out_dir / f"dupes-{stamp}.csv", dupes) + if "json" in targets: + write_json(out_dir / f"dupes-{stamp}.json", dupes, skipped, variant_alerts) + console.print(f"[dim]Reports written to {out_dir}[/]") + + +if __name__ == "__main__": + try: + main() + except KeyboardInterrupt: + console.print("\n[yellow]Aborted by user (Ctrl+C). Cache not written for in-flight scans.[/]") + sys.exit(130)