commit e029e898e9cda6f838e12068480dab9cf9df10bc Author: admin Date: Fri May 22 21:39:09 2026 +0200 Initial snapshot before step 10 package split diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..432eb49 --- /dev/null +++ b/.gitignore @@ -0,0 +1,7 @@ +__pycache__/ +*.pyc +scan-cancel.flag +cache.json +cache.json.tmp +reports/ +.claude/ diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..dd71142 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,71 @@ +# rc-jav (Python CLI) + +Session memory for Codex. Read before making changes here. + +## What this is + +A read-only rclone library comparison + search CLI. Compares `cq:JAV` remote (rclone crypt) against itself (dupe detection) or against external WinCatalog CSV/XML exports. Powers the rclone-jav Brave extension via native messaging. + +## Architecture + +``` +rc-jav.py + ├── reads config.json (default_target etc.) + ├── reads cache.json (per-remote file index, written by --scan) + ├── shells out to: rclone lsf / rclone lsjson / rclone size --json + ├── extract_id() per filename → normalized ID with optional #partN / variant suffix + ├── two query modes: --quick (live rclone --include glob) and cached (uses cache.json) + └── output: rich tables (default) | --basic plain | --format json (for extension) +``` + +## Files + +``` +D:\DEV\Project\rclone-jav\ +├── rc-jav.py single-file CLI +├── config.json default_source/target/catalog (user-editable via --save) +├── cache.json scanned remote file index (written by --scan) +├── wincatalog\ drop WinCatalog CSV/XML exports here (auto-loaded) +├── TODO.md deferred work +└── README.md +``` + +## Companion project + +`D:\DEV\Extensions\Production\rclone-jav\` (PC 1) / `D:\DEV\Extensions\Staging\rclone-jav\` (PC 2) — Brave extension + native messaging host that shells out to `rc-jav.py` for searches. + +## ID normalization + +- `extract_id()` chops trailing single letters (e.g. `IBW-902z.mp4` → `IBW-902`). 
Decision is intentional — see extension's AGENTS.md "Decision log". +- JAV IDs are canonicalized with at least 3 digits (`ABC-27` → `ABC-027`); 4+ digit IDs keep their width (`ABCD-1294`). User expects real JAV IDs to be `ABC-027`, never `ABC-27` or `ABC-0027`. +- Part suffix detection: `_1`, `-pt1`, `(1)` → appended as `#partN` for distinctness. +- Compound prefixes (`FC2-PPV-123`) handled via secondary regex. +- Search matcher does prefix lookup so `IBW-902` finds both `IBW-902` and `IBW-902#part1` etc. +- Quick search must emit only canonical padded uppercase globs (`ABC-027*`, `ABCDE-1167*`). Do not add `--ignore-case`; user never uses lowercase filenames and it caused noticeable delay. + +## Defaults from earlier sessions + +- `cq:JAV` is the current remote root (after the rclone crypt config change moved it down a level) +- `default_target` in config.json = `["cq:JAV"]` +- `human_size()` formats to 2 decimals (e.g. `6.94 GiB`) +- After the 3-digit ID canonicalization change, run `python rc-jav.py --scan` to rebuild `cache.json` under the new padded keys. +- Duplicate KEEP ranking uses configurable VIP folders before source/size/format ranking. Default VIP folder is `ClearJAV`; video files there are treated as the trusted direct-rip copy. +- Duplicate KEEP ranking treats `.ts` as the lowest-priority video container when any non-`.ts` duplicate is available. + +## Recent decisions / bug fixes + +- `--format json` should keep stdout as clean JSON. Status/progress text belongs on stderr in JSON mode. +- Catalog rows are informational. CSV exports mark them as `CATALOG`; JSON exports put them under `catalog`, not `delete_candidates`. +- Cache loading validates the top-level shape and falls back to an empty cache when `remotes` is missing or malformed. +- The old `--recursive/-R` flag was removed because scans are always recursive (`rclone lsf -R` / quick `lsjson -R`). + +## TODO + +See `TODO.md` for deferred work. 
+ +## When making changes + +- Adding CLI flags: also update host invocation in `D:\DEV\Extensions\Production\rclone-jav\host\rcjav-host.py` if the flag matters to the extension +- Changing `extract_id()` semantics: forces a `--scan` to rebuild cache under new keys, and may need a parallel change in extension's `normalizeId()` +- JSON output format changes: extension's popup.js / overlay rendering reads `structured` array — keep field names stable (`source`, `remote`, `path`, `full_path`, `size`, `size_human`, `mod_time`, `jav_id`) +- Config schema: update `--save` writer and any defaults diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..43c994c --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1 @@ +@AGENTS.md diff --git a/CLAUDE.md.bak b/CLAUDE.md.bak new file mode 100644 index 0000000..3b425fe --- /dev/null +++ b/CLAUDE.md.bak @@ -0,0 +1,59 @@ +# rc-jav (Python CLI) + +Session memory for Claude. Read before making changes here. + +## What this is + +A read-only rclone library comparison + search CLI. Compares `cq:JAV` remote (rclone crypt) against itself (dupe detection) or against external WinCatalog CSV/XML exports. Powers the rclone-jav Brave extension via native messaging. + +## Architecture + +``` +rc-jav.py + ├── reads config.json (default_target etc.) 
+ ├── reads cache.json (per-remote file index, written by --scan) + ├── shells out to: rclone lsf / rclone lsjson / rclone size --json + ├── extract_id() per filename → normalized ID with optional #partN / variant suffix + ├── two query modes: --quick (live rclone --include glob) and cached (uses cache.json) + └── output: rich tables (default) | --basic plain | --format json (for extension) +``` + +## Files + +``` +D:\DEV\Project\rclone-jav\ +├── rc-jav.py single-file CLI +├── config.json default_source/target/catalog (user-editable via --save) +├── cache.json scanned remote file index (written by --scan) +├── wincatalog\ drop WinCatalog CSV/XML exports here (auto-loaded) +├── TODO.md deferred work +└── README.md +``` + +## Companion project + +`D:\DEV\Extensions\Production\rclone-jav\` (PC 1) / `D:\DEV\Extensions\Staging\rclone-jav\` (PC 2) — Brave extension + native messaging host that shells out to `rc-jav.py` for searches. + +## ID normalization + +- `extract_id()` chops trailing single letters (e.g. `IBW-902z.mp4` → `IBW-902`). Decision is intentional — see extension's CLAUDE.md "Decision log". +- Part suffix detection: `_1`, `-pt1`, `(1)` → appended as `#partN` for distinctness. +- Compound prefixes (`FC2-PPV-123`) handled via secondary regex. +- Search matcher does prefix lookup so `IBW-902` finds both `IBW-902` and `IBW-902#part1` etc. + +## Defaults from earlier sessions + +- `cq:JAV` is the current remote root (after the rclone crypt config change moved it down a level) +- `default_target` in config.json = `["cq:JAV"]` +- `human_size()` formats to 2 decimals (e.g. `6.94 GiB`) + +## TODO + +See `TODO.md`. Current item: WinCatalog `\` → `/` path normalization in load_catalog_*. 
+ +## When making changes + +- Adding CLI flags: also update host invocation in `D:\DEV\Extensions\Production\rclone-jav\host\rcjav-host.py` if the flag matters to the extension +- Changing `extract_id()` semantics: forces a `--scan` to rebuild cache under new keys, and may need a parallel change in extension's `normalizeId()` +- JSON output format changes: extension's popup.js / overlay rendering reads `structured` array — keep field names stable (`source`, `remote`, `path`, `full_path`, `size`, `size_human`, `mod_time`, `jav_id`) +- Config schema: update `--save` writer and any defaults diff --git a/README.md b/README.md new file mode 100644 index 0000000..08a99d2 --- /dev/null +++ b/README.md @@ -0,0 +1,233 @@ +# rc-jav + +Read-only duplicate scanner for JAV files across rclone remotes. Groups files by JAV ID (e.g. `SSIS-001`) and reports which copy to keep based on priority rules. + +## Priority rules + +1. Video files inside configured **VIP folders** win first. Default VIP folder: `ClearJAV`. +2. If no VIP-folder video exists, **Source always wins** regardless of resolution/size. +3. `.ts` files rank below other video containers, even when the transport-stream copy is larger. +4. If no Source copy exists in the group, **largest file size wins** among the remaining Targets. +5. Suggestions only — script never deletes. Manual cleanup. + +## ID matching + +Filename stem is matched against: + +- Primary: `^([A-Za-z]+)-(\d+)` — `SSIS-001`, `MIDV-123`, `ABP-456` +- Compound: `^(\w+(?:-\w+)+)-(\d+)` — `FC2-PPV-4894535`, `HEYZO-HD-1234` +- Fallback: `^([A-Za-z0-9]+)-(\d+)` — `1pondo-123`, `carib-456` + +IDs are normalized to uppercase and the number is zero-padded to at least 3 digits; numbers with 4 or more digits keep their width (so `ssis-001` == `SSIS-1` == `SSIS-001`, all canonicalized as `SSIS-001`). Anything after the ID (` - Actress [1080p]`) is ignored for matching. 
+ +### Part-suffix handling + +Multi-part files (`_1`, `_2`, `-1`, `-2`, `_A`, `_B`, `.1of4`, ` (1)`, `-pt1`, `-part1`, `-cd1`, `-disc1`, trailing ` N`) are normalized as `{ID}#partN` so they do not collide as false duplicates. Searching the base ID still finds all parts. Lettered `_A` / `_B` suffixes become part 1 / part 2. + +Add more suffix shapes with repeatable `--part-pattern` regexes. The first capture group is the part number or one part letter and the pattern runs against the filename stem: + +```powershell +python rc-jav.py --scan --part-pattern '[-_ ]side[-_ ]?(A|B)$' +python rc-jav.py --part-pattern '_([CD])$' --save +``` + +Saved rules live in `config.json` as `part_patterns`. The extension Options page has the same custom part detector list for host-triggered searches, duplicate review, and cache rebuilds. + +Files with no parseable ID are listed under "Skipped" at the end so you can spot misnamed files. + +### Rule checks + +Focused rule tests cover ID extraction, multipart grouping safety, and duplicate KEEP ranking: + +```powershell +python -B -m unittest discover -s tests -v +``` + +## Usage + +``` +python rc-jav.py \ + --source cq:personal-files/ClearJAV/ichika-matsumoto \ + --target cq:personal-files/JAV/TMP +``` + +Flags: +- `--source` / `-s REMOTE` — priority remote path. Repeat for multiple. +- `--target` / `-t REMOTE` — non-priority remote path. Repeat for multiple. +- `--format {console,txt,csv,json,all}` — default `console`. Non-console formats write to `--output-dir`. +- `--output-dir DIR` — default `./reports`. +- `--no-color` — disable ANSI colors but keep rich layout (tables, panels). +- `--basic` — plain text output, no rich tables/panels. Progress ticks every 25 files on stderr. Useful for piping or simple terminals. +- `--rclone-bin PATH` — path to rclone executable (default: `rclone` on PATH). Example: `--rclone-bin C:\Programs\rclone\rclone.exe`. 
+- `--clearjav` — shortcut: sets source = `DEFAULT_SOURCE`, target = `DEFAULT_TARGET`. Equivalent to `--source cq:personal-files/ClearJAV --target cq:personal-files/JAV/TMP`. Combine with `--source`/`--target` to override one side. + +Examples: + +``` +# full library dupe scan, one flag +python rc-jav.py --clearjav + +# same but only check one actress folder against TMP +python rc-jav.py --clearjav --source cq:personal-files/ClearJAV/ichika-matsumoto +``` + +## Search mode + +Check whether a JAV ID already exists in your library before downloading: + +``` +python rc-jav.py --search SSIS-001 +python rc-jav.py --search SSIS-001 --search FC2-PPV-4894535 + +# wildcards (quote to avoid shell glob expansion) +python rc-jav.py --search "IPZZ-*" +python rc-jav.py --search "FC2-PPV-*" +python rc-jav.py --search "SSIS-???" # exact 3-digit numeric +``` + +Wildcard syntax: `*` (any chars) and `?` (one char), case-insensitive. Matches against normalized IDs in the index, including `#partN` suffixes automatically. + +Range syntax: `[N-M]` inclusive both ends. Works inside any prefix. + +``` +python rc-jav.py --search "IPZZ-[820-860]" +python rc-jav.py --search "FC2-PPV-[4894500-4894600]" +python rc-jav.py --search "MIDV-[001-010]" # zero-padding preserved +``` + +Quote in PowerShell/bash so `[...]` reaches Python literally. Reversed ranges (`860-820`) auto-swap. + +With no `--source` / `--target` flags, only `DEFAULT_TARGET` (TMP) is scanned — the typical case for "do I already have this in my unsorted pile?". Pass `--source cq:personal-files/ClearJAV` to also check the priority library. Edit `DEFAULT_SOURCE` / `DEFAULT_TARGET` at the top of the script to change defaults. Remote scans are recursive. + +Exit code: `0` if every query had at least one hit, `1` otherwise — useful for shell automation. + +## Name search (`--name`) + +Substring search against filenames (case-insensitive). Find all files by actress, studio, tag, anything that appears in the filename. 
+ +``` +python rc-jav.py --name Ichika +python rc-jav.py --name "Ichika Matsumoto" +python rc-jav.py --name Ichika --name Yui # OR — files matching either +python rc-jav.py --name "Mat*" # glob wildcard +python rc-jav.py --search IPZZ-860 --name Ichika # both — separate result blocks +``` + +- Multiple `--name` tokens = OR. Use one combined `--name "foo bar"` for AND/exact-substring. +- Matches against the filename stem only (not folder names). +- Auto-routes to **cached** mode because substring globs can't be server-side filtered on most backends. Pass `-q` to force quick anyway (slower). + +### Smart search mode (auto quick / cached) + +The script auto-picks the right execution path per query and prints which one it chose: + +| Query shape | Picked mode | Reason | +|---|---|---| +| Single exact ID (`IPZZ-860`) | quick | live rclone `--include`, ~1–2s even on huge trees | +| Wildcard (`IPZZ-*`, `SSIS-???`) | cached | reliable normalized matching | +| Range (`IPZZ-[820-860]`) | cached | avoids N rclone calls | +| Multiple `--search` flags | cached | warmup amortizes | + +Override: +- `--quick` / `-q` — force live rclone lookup (skips cache). +- `--cache` — force cache (builds it if cold). + +Quick mode never reads or writes the cache. Cache mode honors `--update` and `--no-cache` as before. + +### Cache + +Search mode caches each remote's file list in `./cache.json` next to the script. Subsequent searches are near-instant. + +- First run: scans + writes cache. +- Later runs: reads cache (banner shows `CACHED 14m (154 files)`). +- `--update` / `-u`: force re-scan + overwrite cache for the requested remotes. +- `--no-cache`: bypass cache (no read, no write). +- Stale warning when cache is older than 24h — still used, marked `CACHED-STALE`. +- Ctrl+C during a scan: rclone is terminated, cache for in-flight remote is NOT written. + +Delete `cache.json` to reset everything. 
+ +### Saving defaults (--save) + +Persist `--source`, `--target`, `--catalog`, and/or `--part-pattern` to `config.json` so you don't have to type them every run. + +``` +# set default target +python rc-jav.py --target cq:personal-files/JAV/TMP --save + +# set source + multiple targets at once +python rc-jav.py --source cq:personal-files/ClearJAV ^ + --target cq:personal-files/JAV/TMP ^ + --target cq:personal-files/JAV/SORTED ^ + --save + +# inspect +type config.json +``` + +Only the keys you explicitly pass are written — running `--save --target X` won't wipe a saved `default_source`. Delete `config.json` to reset to the hardcoded defaults at the top of `rc-jav.py`. + +### Scan-only (--scan) + +Refresh the cache without running a search or dupe report — useful for Task Scheduler / cron pre-warming. + +``` +# default: refresh DEFAULT_TARGET (TMP) +python rc-jav.py --scan + +# refresh both source and target +python rc-jav.py --scan --source cq:personal-files/ClearJAV --target cq:personal-files/JAV/TMP + +# nightly via Task Scheduler +schtasks /Create /SC DAILY /ST 03:00 /TN "rc-jav nightly scan" ^ + /TR "python D:\DEV\Project\rclone-jav\rc-jav.py --scan --basic" +``` + +`--scan` always overwrites the cache for the remotes you list. Exit 0 = success, non-zero = rclone failure. + +``` +python rc-jav.py --search MIDV-999 ; if ($LASTEXITCODE -eq 0) { "have it" } else { "download" } +``` + +## WinCatalog integration + +WinCatalog's native `.wcat` format is proprietary, so the script reads its exports instead. + +1. In WinCatalog: **File → Export** → choose **CSV** or **XML**. +2. Save into the `wincatalog/` folder next to the script. All `*.csv` and `*.xml` files there are auto-loaded — drop in as many discs as you want. +3. Run as normal: `python rc-jav.py --search IPZZ-860` +4. Override or add extra paths with `--catalog PATH` (file or folder, repeatable). +5. To change the default folder, edit `DEFAULT_CATALOG` at the top of the script. 
+ +Re-export when your catalog changes; the script re-reads on every run (catalog data is **not** cached — it's already a local file). + +**Role of catalog hits:** +- Search: shown as rows with source label `Catalog`. The disc/volume name is encoded into the path so you know which offline backup holds the file. +- Dupe mode: catalog entries appear in groups for awareness but are **never marked KEEP or DELETE?** — they're offline, can't be touched. A group is only flagged as a dupe when 2+ rclone copies exist. + +**CSV column auto-detection** (case-insensitive, first match wins): +- Name: `Name`, `File Name`, `Filename`, `Title` +- Path: `Path`, `Full Path`, `Location`, `Folder` +- Size: `Size`, `File Size`, `Bytes`, `Size (bytes)` +- Disc: `Disc`, `Disc Name`, `Disc Label`, `Volume`, `Source`, `Catalog`, `Media` + +XML: walks the tree, treats `` / `` nodes inside `` / `` / `` containers, with `` nesting. + +## Requirements + +- Python 3.9+ +- `pip install rich` (used for progress bars + themed output) +- `rclone` on `PATH` with the relevant remotes configured. + +## UI + +- Live per-file progress bar during scans (`rclone size --json` for total, then `rclone lsf --files-only -R --format pst` streamed). +- Banner panel showing run mode + per-remote cache status. +- Rich tables for search hits and duplicate groups. +- `--no-color` for plain output (CI, piping). + +## Roadmap + +- Phase 1 (current): report duplicates + search. +- Phase 2: `--apply` mode that runs `rclone delete` on `DELETE?` candidates behind a confirmation gate. +- Phase 3: resolution-aware tiebreakers, move-to-review folder, scheduled runs. diff --git a/TODO.md b/TODO.md new file mode 100644 index 0000000..f831295 --- /dev/null +++ b/TODO.md @@ -0,0 +1,9 @@ +# TODO / Deferred work + +## Deferred + +(append below) + +## Completed notes + +- WinCatalog CSV/XML paths are normalized from `\` to `/` during catalog load. 
diff --git a/config.json b/config.json new file mode 100644 index 0000000..496030f --- /dev/null +++ b/config.json @@ -0,0 +1,5 @@ +{ + "default_target": [ + "cq:JAV" + ] +} \ No newline at end of file diff --git a/fixtures/README.md b/fixtures/README.md new file mode 100644 index 0000000..2a00990 --- /dev/null +++ b/fixtures/README.md @@ -0,0 +1,83 @@ +# Shared JAV ID fixture corpus + +JSON cases shared between the Python `rc-jav.py` CLI and the browser +extension at `D:\DEV\Extensions\Production\rclone-jav\`. Each side +reads the cases relevant to its own extraction surface. + +## Files + +| File | Domain | Consumer | Notes | +|-------------------------------|----------|----------------------------------------|-------| +| `filename-extraction.json` | filename | Python `extract_id(name)` | Has `#partN` expectations for multipart files | +| `query-extraction.json` | query | Extension `content.js` `normalizeId` | Looser context; extension never emits part suffix | +| `shared-normalization.json` | shared | BOTH | Contract: any mismatch here is a bug, not a fixture issue | + +All files share the same shape: + +```json +{ + "version": 1, + "domain": "…", + "description": "…", + "case_schema": { … }, + "cases": [ + { "name": "…", "input": "…", "expected": "…" } + ] +} +``` + +`expected: null` means "no ID should be detected". + +## Running the Python side + +```bash +python fixtures/run.py +``` + +The runner imports `rc-jav.py` in place, exercises `extract_id` against +`filename-extraction.json`, and `normalize_id` against +`shared-normalization.json`. Exit code is non-zero on any failure. + +## Running the extension side + +No automated runner today. `content.js` lives inside an IIFE that the +browser injects into pages, so importing it from Node would require +either an extraction refactor or a duplicated copy of the regex. 
Until +that lands, treat `query-extraction.json` and `shared-normalization.json` +as the canonical specification: if you touch `ID_RE_DASHED`, +`ID_RE_UNDASHED`, or `BUILTIN_ID_NORMALIZERS` in content.js, eyeball +this corpus and confirm the cases still describe expected behavior. + +## Adding a case + +1. Pick the file matching the surface you're testing. +2. Append a `{ "name", "input", "expected" }` entry. Keep `name` + descriptive — it's the only label shown when the runner fails. +3. If the case exercises a guarantee both sides must honor, add it to + `shared-normalization.json` as well. +4. Run `python fixtures/run.py` to confirm Python still passes. + +## Known cross-side divergences (intentional) + +These are NOT bugs — they reflect the different surfaces each side +extracts from. Recorded here so future contributors don't try to +"fix" them. + +- **`FC2PPV1841460` compact form (no dashes).** The extension's + `BUILTIN_ID_NORMALIZERS` in `content.js` rewrites this to + `FC2-PPV-1841460` when seen in page titles. Python `extract_id` + does NOT — the compact form doesn't realistically appear in + filenames on disk. Hence the case lives in + `query-extraction.json` only, not in `filename-extraction.json` or + `shared-normalization.json`. + +If a case belongs to one side's contract but not the other's, file it +under the specific domain (`filename-` or `query-`) — not under +`shared-`. + +## Ownership + +This directory lives in the Python repo only because the Python repo +is the more stable root. Conceptually it's joint property of both +codebases. Don't add anything Python-specific to the JSON files — keep +them tool-neutral. diff --git a/fixtures/filename-extraction.json b/fixtures/filename-extraction.json new file mode 100644 index 0000000..fc656b6 --- /dev/null +++ b/fixtures/filename-extraction.json @@ -0,0 +1,24 @@ +{ + "version": 1, + "domain": "filename", + "description": "Filename → canonical JAV ID (with optional #partN suffix). 
Consumed by Python rc-jav.extract_id.", + "case_schema": { + "name": "human label", + "input": "filename including extension", + "expected": "canonical ID (e.g. ABC-001 or ABC-001#part1) or null when no ID present" + }, + "cases": [ + { "name": "plain dashed ID", "input": "ABC-027.mp4", "expected": "ABC-027" }, + { "name": "dashed ID with resolution tag", "input": "SCOP-297 [1080p].mp4", "expected": "SCOP-297" }, + { "name": "bracket-wrapped ID", "input": "[REAL-779].mp4", "expected": "REAL-779" }, + { "name": "bracket-wrapped ID with extra tag", "input": "[SCOP-297] [1080p].mp4", "expected": "SCOP-297" }, + { "name": "no-hyphen fallback", "input": "MVSD312.avi", "expected": "MVSD-312" }, + { "name": "trailing lowercase variant letter", "input": "IBW-902z.mp4", "expected": "IBW-902z" }, + { "name": "multipart _PART suffix", "input": "KV-118 - Aiba Reika_PART1.mp4", "expected": "KV-118#part1" }, + { "name": "multipart _A letter suffix", "input": "KV-118_A.mp4", "expected": "KV-118#part1" }, + { "name": "multipart trailing -N before bracket", "input": "OFJE-195-7 [480p].mp4", "expected": "OFJE-195#part7" }, + { "name": "FC2 PPV plain", "input": "FC2-1841460.mp4", "expected": "FC2-PPV-1841460" }, + { "name": "FC2 PPV explicit", "input": "FC2-PPV-1841460.mp4", "expected": "FC2-PPV-1841460" }, + { "name": "no ID present", "input": "random_video.mp4", "expected": null } + ] +} diff --git a/fixtures/query-extraction.json b/fixtures/query-extraction.json new file mode 100644 index 0000000..25cdcce --- /dev/null +++ b/fixtures/query-extraction.json @@ -0,0 +1,22 @@ +{ + "version": 1, + "domain": "query", + "description": "Page text / title -> canonical JAV ID. Consumed by the browser extension (content.js normalizeId). Difference from filename: looser context (sentences, mixed punctuation, site chrome). Includes forms (e.g. 
FC2PPV compact) that Python extract_id does NOT handle, by design — see fixtures/README.md.", + "case_schema": { + "name": "human label", + "input": "raw page text", + "expected": "canonical ID without part suffix (extension never emits #partN), or null when no ID found" + }, + "cases": [ + { "name": "title with site chrome", "input": "SSIS-001 — JAV.tube", "expected": "SSIS-001" }, + { "name": "title with description", "input": "Watch SSIS-001 1080p HD Online", "expected": "SSIS-001" }, + { "name": "trailing letter variant", "input": "IBW-902z Full Movie", "expected": "IBW-902" }, + { "name": "no hyphen in title", "input": "MVSD312 stream", "expected": "MVSD-312" }, + { "name": "FC2 PPV compact", "input": "FC2PPV-1841460 — preview", "expected": "FC2-PPV-1841460" }, + { "name": "FC2 plain digits", "input": "FC2-1841460 thumbnail", "expected": "FC2-PPV-1841460" }, + { "name": "FC2-PPV explicit", "input": "FC2-PPV-1841460 Full", "expected": "FC2-PPV-1841460" }, + { "name": "leading zeros preserved", "input": "ABF-042 — sample", "expected": "ABF-042" }, + { "name": "long numeric tail (7 digits)", "input": "BLK-4748520 stream", "expected": "BLK-4748520" }, + { "name": "no ID present", "input": "JAV Database · home", "expected": null } + ] +} diff --git a/fixtures/run.py b/fixtures/run.py new file mode 100644 index 0000000..3f97e68 --- /dev/null +++ b/fixtures/run.py @@ -0,0 +1,70 @@ +"""Run the shared JAV-ID fixture corpus against rc-jav.py. + +Exits non-zero if any fixture case fails. No third-party dependencies. 
+ +Usage: + python fixtures/run.py +""" +from __future__ import annotations + +import importlib.util +import json +import sys +from pathlib import Path + +ROOT = Path(__file__).resolve().parents[1] +FIXTURES = Path(__file__).resolve().parent + +SPEC = importlib.util.spec_from_file_location("rcjav", ROOT / "rc-jav.py") +RCJAV = importlib.util.module_from_spec(SPEC) +sys.modules[SPEC.name] = RCJAV +SPEC.loader.exec_module(RCJAV) + + +def _load(name: str) -> dict: + with (FIXTURES / name).open("r", encoding="utf-8") as f: + return json.load(f) + + +def _run(label: str, cases: list[dict], fn) -> tuple[int, int]: + passed = 0 + failed = 0 + for case in cases: + got = fn(case["input"]) + if got == case["expected"]: + passed += 1 + else: + failed += 1 + print(f" FAIL [{label}] {case['name']!r}") + print(f" input = {case['input']!r}") + print(f" expected = {case['expected']!r}") + print(f" got = {got!r}") + return passed, failed + + +def main() -> int: + total_passed = 0 + total_failed = 0 + + for filename, fn_name, fn in [ + ("filename-extraction.json", "extract_id", RCJAV.extract_id), + ("shared-normalization.json", "normalize_id", RCJAV.normalize_id), + ]: + doc = _load(filename) + cases = doc.get("cases", []) + print(f"\n{filename} -> rcjav.{fn_name} ({len(cases)} cases)") + p, f = _run(filename, cases, fn) + total_passed += p + total_failed += f + print(f" {p} passed | {f} failed") + + print() + if total_failed: + print(f"FAILED: {total_failed} of {total_passed + total_failed} cases") + return 1 + print(f"OK: all {total_passed} cases passed") + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/fixtures/shared-normalization.json b/fixtures/shared-normalization.json new file mode 100644 index 0000000..7880f5b --- /dev/null +++ b/fixtures/shared-normalization.json @@ -0,0 +1,17 @@ +{ + "version": 1, + "domain": "shared", + "description": "Raw ID forms → canonical form. 
Both Python (normalize_id) and the extension (content.js normalizeId) MUST agree on these. Mismatch here is a contract bug.", + "case_schema": { + "name": "human label", + "input": "raw ID-bearing token (no path, no extension)", + "expected": "canonical ID" + }, + "cases": [ + { "name": "lowercase prefix uppercased", "input": "abc-027", "expected": "ABC-027" }, + { "name": "FC2 plain -> FC2-PPV", "input": "FC2-1841460", "expected": "FC2-PPV-1841460" }, + { "name": "FC2-PPV explicit preserved", "input": "FC2-PPV-1841460", "expected": "FC2-PPV-1841460" }, + { "name": "leading zeros preserved", "input": "ABF-042", "expected": "ABF-042" }, + { "name": "5-digit numeric segment", "input": "SDDE-12345", "expected": "SDDE-12345" } + ] +} diff --git a/mockups/console-consolidation-claude.html b/mockups/console-consolidation-claude.html new file mode 100644 index 0000000..05ed4c8 --- /dev/null +++ b/mockups/console-consolidation-claude.html @@ -0,0 +1,695 @@ + + + + + rclone-jav consolidation — final converged plan + + + +
+

rclone-jav Consolidation — Final Converged Plan

+
+ + Status: execution in progress. Shipped: steps 1 (Sim Dupe delete), 2 (CSS extraction), 3 (Transfer Assistant delete + Diagnostics replacement), 5 (Recent Activity + Search Troubleshooting → new Debug Tools pane). Pending: steps 6 (options.js split — Cache & Dup Review paired, biggest), 7a (Bulk Check standalone window), 8 (fixtures), 9 (cache contract), 10 (rc-jav.py split), 11 (host fast-path decision). See D:\DEV\Extensions\Production\rclone-jav\AGENTS.md "Console consolidation refactor — execution status" for current state. +
+ +
+
+

✓ Decided

+
    +
  • Console / Settings / Support tri-split
  • +
  • Default landing = Duplicate Review
  • +
  • Status badges on tabs, no dashboard pane
  • +
  • Launcher pattern over toolbox
  • +
  • Keep Ranking nested in Dup Review
  • +
  • Sim Dupe → delete, samples/ HTML harness
  • +
  • Transfer wizard → delete after Diagnostics replacement verified
  • +
  • Bulk ID Check → detached chrome.windows popup, NOT a Console sidebar tab
  • +
  • Inline rule tests stay, standalone benches → Debug
  • +
+
+
+

✓ Shipped

+
    +
  • Step 1: Sim Dupe deleted from popup. samples/sim-dupe.js preserves payload.
  • +
  • Step 2: CSS extracted → options.css. options.html 1179 → 794 lines.
  • +
  • Step 3: Transfer wizard deleted. Diagnostics → Native host registration now shows Extension ID + Copy button.
  • +
  • Step 5: Recent Activity + Search Troubleshooting moved to new Debug Tools pane. Scope verified by code read.
  • +
+
+
+

📋 Pending

+
    +
  • Step 6: options.js split (Cache + Dup Review paired). 3133-line file. Biggest, riskiest.
  • +
  • Step 7a: bulk-check.html standalone + popup launcher.
  • +
  • Steps 8–10: fixtures, cache contract, rc-jav.py split.
  • +
  • Step 11: host fast-path benchmark + narrow/delete decision.
  • +
+
+
+ + +

1. Primary recommended layout

+

Default landing = Duplicate Review (user's most-frequent maintenance workflow). Sidebar tab labels carry live status badges — no dashboard pane needed. Launcher pattern: heavy tools open focused panes, not nested fieldsets.

+ +
+
+ rclone-jav +
+ + +
+
+
+ +
+
+
+

Duplicate Review 27 pending

+

After-upload workflow. Risky groups skipped by default. Keep Ranking lives here as configuration, not in a separate Settings tab.

+
+
+ + +
+
+ +
+ Pending Review + Skipped — Risky + Keep Ranking Rules + Delete History +
+ +
+
+

Filter (this tool only)

+
+ + + +
+

Filters scoped — never exported as global settings.

+
+
+

Delete queue

+
12 files · 47.3 GiB
+

Safety: VIP folders + multipart-risk paths auto-excluded.

+
+
+ +
+

JBD-291 · 2 candidates

+
+
+ /JAV/clearjav/JBD-291 [1080p].mp4 + 4.94 GiB + KEEP +
+
+ /JAV/old/JBD-291.mp4 + 3.82 GiB + +
+
+
+ +
+

OFJE-195 · multipart risk

+
+
+ /JAV/OFJE-195_PART1.mp4 + 2.10 GiB + REVIEW +
+
+ /JAV/OFJE-195_PART2.mp4 + 2.08 GiB + REVIEW +
+
+
+
+
+
+ + +

2. Decision table (refactor spec data)

+

Each current pane mapped to its future home, treatment, ship order, and replacement work (if any). Ship order = execution sequence within phase 3 (UI consolidation). Steps share PR scope where useful.

+ +
+ KEEP visible + CONTEXTUAL (lives with feature) + MOVE (relocate) + DEBUG only + DELETE +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
# | Current surface | Future home | Treatment | Replacement
1 | Sim Dupe popup action | samples/popup-states.html (repo file) | DELETE | No product replacement. Repo HTML for layout testing only.
2 | CSS embedded in options.html | Per-pane .css files alongside per-pane JS | EXTRACT | No behavior change. Reduces options.html before JS split.
3 | Transfer Assistant wizard | (gone) | DELETE | Replacement = Diagnostics 3 actions (see §3 checklist). Delete after verification.
4 | Cache & Scans pane | Console → Cache & Scans | KEEP | Paired with Dup Review extraction. State interface shared.
4 | Duplicate Review pane | Console → Duplicate Review (default landing) | KEEP | Same PR as Cache & Scans. Reads cache state.
4 | Keep Ranking Rules pane | Duplicate Review → Keep Ranking Rules (sub-tab) | CONTEXTUAL | Moves with Dup Review. Becomes nested sub-tab.
4 | VIP folders config | Duplicate Review → Keep Ranking Rules | CONTEXTUAL | Feature-specific config moves with feature.
5 | Recent Activity (search/page history) | Support → Debug Tools → Search Activity | DEBUG | If audit deletion events also present, split out (pending verification).
5 | Search Troubleshooting | Support → Debug Tools | DEBUG | Standalone bench. No edit locality.
5 | Page Extraction Test (standalone) | Support → Debug Tools | DEBUG | Inline "Pick Element" variant stays in Site Extraction settings.
5 | Test ID Extraction (inline) | Settings → Matching Rules (collapsible per editor) | CONTEXTUAL | Editor feedback. Stays beside rule it tests.
5 | Test ID Extraction (standalone bench) | Support → Debug Tools | DEBUG | Second row — split from inline version above.
6 | Library Issues pane | Console → Library Issues | KEEP | Own tab + status badge. Rename UI nested as sub-tab.
6 | Bulk ID Check | bulk-check.html — detached chrome.windows popup | RESHAPE | Removed from Console sidebar. Single entry path = popup launcher button → opens 640×540 detached window. Different tool type than Console panes (transient utility, no sidebar context).
7 | Profiles, Scan Behavior, Overlays, Deletion settings | Settings → (separate sub-tabs) | KEEP | Settings sub-tabs split into separate JS files.
7 | Matching Rules / Site Extraction | Settings → (separate sub-tabs, inline tests retained) | KEEP | Collapsible inline tester beside each rule.
8 | (new) Shared fixture corpus | Top-level fixtures/ (neutral location) | NEW | Contract between extension and Python. Both consume.
9 | Cache contract design | CACHE_VERSION (exists) + ID_RULES_VERSION (new) | NEW | Schema bump = force rebuild. Rules bump = warn-and-mark-stale.
10 | rc-jav.py monolith | rcjav/ package (ids, cache, dupes, catalog, …) | SPLIT | After fixtures + tests + cache contract exist.
11 | Host fast-path search | Narrow / Delete / Keep — based on §4 benchmark | DECIDE | Benchmark idle + under-scan latency first.
+ + +

3. Pre-execution checklists (user handoffs)

+ +
+
+

Diagnostics replacement verification (gates step 3 — Transfer wizard delete)

+
Current extension ID shown as one-line text with copy-to-clipboard button
Replaces wizard's "your extension ID is…" step.
user opens
Diagnostics
+
Button labeled "Re-register host" that triggers register-host.bat path
Replaces wizard's "run this script" step.
user opens
Diagnostics
+
Verification result shown inline within 2s of register click
Replaces wizard's "now check the result" step.
user opens
Diagnostics
+
All three above visible without expanding collapsed sections (one screen)
If buried in expandable cards, write better UI first.
visual
inspection
+
+ +
+

Recent Activity scope test (settles split question)

+
Open Recent Activity. Note current entry types visible.
LIVE search, CACHE search, MATCH, NO_MATCH, NO_ID, page-check, etc.
user
+
Perform a delete in Duplicate Review. Refresh Recent Activity.
Single delete operation, any candidate.
user
+
If delete event appears → audit value exists. Split into Dup Review → Delete History.
If no → single role. Move entire log to Debug Tools.
user reports
+
+
+ +

Cosmetic remaining: popup launcher button label "Bulk Check" vs icon-only. Either works. Default to label until popup row gets crowded.

+ + +

4. Bulk Check — detached window pattern

+

User clarified: Bulk Check is a transient utility, not a persistent Console surface. Doesn't fit sidebar-tab pattern alongside Dup Review / Cache & Scans / Library Issues. Decision: standalone bulk-check.html opened as detached chrome.windows popup, no Console sidebar entry. Single canonical entry path = popup launcher button.

+ +
+
+

Browser-action popup with launcher

+ +
+ +
+

Detached window (640×540) after launch

+ +

Detached window. No tab bar, no address bar. Closes cleanly when done. Sits over browser, stays visible across tab switches.

+
+
+ +
+

Why detached window, not Console tab

+
+
+

Other Console tools (Dup Review, Cache & Scans, Library Issues):

+
    +
  • long workflows, multi-pass
  • +
  • need sidebar context (compare to other tools)
  • +
  • persistent state (review queue, scan job)
  • +
  • fit Options sidebar tab pattern
  • +
+
+
+

Bulk Check:

+
    +
  • short workflow, one-shot
  • +
  • no sidebar context needed
  • +
  • transient state (last-paste persisted, results ephemeral)
  • +
  • fits detached-window pattern
  • +
+
+
+

Different tool type. Treating it like Dup Review was a category error. Single user knows the feature exists — discovery via popup button is enough.

+
+ +
+

Implementation notes

+
// Popup launcher click handler
+chrome.windows.create({
+  url: chrome.runtime.getURL('bulk-check.html'),
+  type: 'popup',
+  width: 640,
+  height: 540
+});
+
    +
  • Window dedup: track open bulk-check window ID in chrome.storage.session. Second launcher click focuses existing window instead of spawning duplicate.
  • +
  • State persistence: last paste saved to chrome.storage.local key bulk_check_last_paste. Reopen restores. Results are ephemeral (re-run on reopen).
  • +
  • Backend reuse: calls native host via same messaging path popup search uses. No new backend code.
  • +
  • No back nav: window can't navigate. User closes when done. Ctrl+W closes the bulk window, not a browser tab.
  • +
+
+ +
+

Edge cases

+
    +
  • Popup auto-closes after launcher click (Chrome behavior). Window survives. Good — that's the intent.
  • +
  • Window positioning unreliable. Chrome treats left/top as hints, multi-monitor users may get the window on the wrong screen. Acceptable for personal-use tool.
  • +
  • Brave / Edge variance. Detached popups behave slightly differently across Chromium forks. Test on user's actual browser before shipping. Fallback if broken: open bulk-check.html in a normal tab via chrome.tabs.create.
  • +
+
+ +
+

Does NOT generalize

+

Detached-window pattern fits Bulk Check because it's transient + no-sidebar-context + short. Doesn't apply to:

+
    +
  • Diagnostics — reference info, lives in sidebar fine
  • +
  • Setup repair button — already inline in Diagnostics, small enough
  • +
  • Dup Review / Cache & Scans / Library Issues — long workflows, sidebar context useful
  • +
  • Settings — set-and-forget, not workflow
  • +
+

One-tool answer, not a pattern across the app.

+
+ + +

5. Execution sequence (final)

+

Codex's revised order (triage first, boundary doc second) with my refinements. Risk and dependencies marked. Steps 1–4 are reversible single-file changes (warmup phase). Steps 5–10 = structural. Step 11 = final architectural call.

+ +
+
1
Per-pane triage — 30 min with user. Decision table above IS this artifact.
zero risk · no deps
+
2
Boundary ownership doc — extension extracts query ID, Python owns filename semantics, host adapts. 1 hour, no code.
zero risk · after #1
+
3
Host fast-path benchmark — latency under idle Python AND under scanning Python. Result gates step 11.
measure only · no deps
+
4
Delete confirmed surfaces — Sim Dupe popup button (no replacement), Transfer wizard (after Diagnostics verification passes).
trivial · after §3 checklist
+
5
CSS extraction from options.html — per-pane CSS files. No behavior change. Bisect-friendly.
low · after #4
+
6
options.js split: Cache & Dup Review paired — Dup Review reads cache state. Single PR extracts both. Keep Ranking moves with Dup Review.
moderate · after #5
+
7
options.js split: Debug Tools + Library Issues + Settings sub-tabs — remaining Options extractions. Inline test components reused across rule editors. Bulk Check is NOT here — it's a new standalone file (step 7a).
moderate · after #6
+
7a
Create bulk-check.html standalone + popup launcher button — new HTML file, own JS module, no Options dependency. Popup gets one button calling chrome.windows.create. Window dedup + state persistence in chrome.storage.
additive · parallel to #7
+
8
Shared fixture corpus — top-level fixtures/ (neutral). Python and extension both consume.
additive · no blocking
+
9
Cache contract design — CACHE_VERSION (exists) + ID_RULES_VERSION (new). Schema vs semantics, two concepts.
design decision · before #10
+
10
rc-jav.py module split — ids.py, cache.py, dupes.py, catalog.py, rclone_io.py, cli.py. Tests pre-exist via #8.
code churn · after #8, #9
+
11
Host narrow / keep / delete — based on #3 benchmark. If under-scan responsiveness depends on host = keep narrow. If not = delete.
behavior change · after #3, #10
+
+ + +

6. Acceptance criteria template

+

Each step in the sequence needs three things in the final spec: acceptance criterion, rollback procedure, touched-files list. Without these, "ship order N" is a wish not a plan. Template below — fill per step in spec doc.

+ +
+

Template per sequence step

+
step: 6
+title: options.js split — Cache & Dup Review paired
+
+touched_files:
+  - options.html (script tag order changes)
+  - options.js (DELETE: cache section, dup review section, keep ranking section)
+  - options-cache.js (NEW)
+  - options-review.js (NEW)
+  - options-core.js (NEW: shared helpers, pane nav, save/load)
+
+acceptance:
+  - Fresh extension reload, options.html opens
+  - Default landing = Duplicate Review tab
+  - Cache & Scans tab loads, shows last scan timestamp
+  - Run Duplicate Review on existing cache — same result set as pre-refactor
+  - Keep Ranking Rules sub-tab inside Dup Review opens
+  - No console errors on load or interaction
+
+rollback:
+  - git revert <sha>
+  - No data migration. Cache schema unchanged. Storage keys unchanged.
+  - Diagnostics-verified replacement of Transfer wizard remains intact (step 3 already shipped).
+
+ + +

7. Out of scope (explicitly rejected)

+
    +
  • Dashboard pane — tab badges replace. Adding a dashboard creates a feature sink.
  • +
  • After-Upload workflow wizard page — sidebar nav order already encodes the workflow.
  • +
  • Matching Lab consolidation page — inline tests cover editor needs, standalone bench in Debug covers diagnostic needs.
  • +
  • Mode switcher top bar (Console / Settings / Support segmented control) — sidebar groups do this.
  • +
  • In-extension Sim Dupe / Debug Preview page — repo HTML file is enough for single-user layout work.
  • +
  • Popup bulk mode toggle — popup stays single-job. Launcher button opens detached window, no inline bulk mode.
  • +
  • Bulk ID Check as Console sidebar tab — wrong tool type for sidebar pattern. Detached window matches its transient nature.
  • +
  • Bulk Check as Options-page deep-link tab — previously considered. Rejected: leaves a leftover tab open after use, Options sidebar adds noise to a one-shot tool.
  • +
  • Frontend framework (React/Vue/Svelte) — vanilla + ordered script files is correct for MV3 + project scale.
  • +
  • Console.log telemetry for usage audit — manual triage of single-user project beats instrumented signals.
  • +
+ +

8. Net position

+

Architecture decided. Three small user handoffs remain (Diagnostics verification, Recent Activity scope check, popup button label). After those, decision table expands into per-step spec with acceptance + rollback. Code work begins on step 1 (smallest, fastest, lowest risk). Total estimated execution span: phased over multiple PRs, no big-bang refactor.

+ +
+ + diff --git a/mockups/console-consolidation-options.html b/mockups/console-consolidation-options.html new file mode 100644 index 0000000..abe511e --- /dev/null +++ b/mockups/console-consolidation-options.html @@ -0,0 +1,355 @@ + + + + + rclone-jav console consolidation direction + + + +
+

rclone-jav Consolidation Direction

+

Updated after the refactor discussion. The page keeps the earlier visual samples, but the decisions are now explicit: launcher-style maintenance console, Duplicate Review as the default work surface, status on navigation instead of a dashboard pane, Bulk ID Check with a popup-launched quick window plus a full Console tool, and debug/testing surfaces pulled out of the daily workflow.

+
+ Frequent maintenance + Set-and-forget settings + Support / debug + Placement decision +
+ +
+
+

1. Recommended Console Shell

Frequent maintenance tools get focused destinations. The navigation carries status instead of a separate dashboard pane.

chosen direction
+
+
rclone-jav Console
+
+ +
+

Duplicate Review

Default landing after uploads. Keep Ranking Rules and delete history stay with the workflow that uses them.

27 pending
+
+

Review Queue

12 ready · 2 risky

Uses VIP folders, multipart safety, keep reasons, and delete queue checks.

+

Contextual Config

Keep Ranking Rules live inside Duplicate Review, not as a distant general setting.

+
+
+

Cache Status Lives In Nav

Cache & Scans owns the scan detail. The sidebar badge is enough while you review dupes.

+

Console Neighbors

Library Issues and Bulk ID Check remain direct tools, not settings fieldsets.

+
+
+
+
+

Chosen: Console / Settings / Support sidebar, Duplicate Review first, no dashboard pane.

+
+ +
+

2. Launcher Treatment

The page should open focused maintenance views instead of collecting every heavy tool as a permanent fieldset.

chosen pattern
+
+
Library Console
+
+
+

Cache Status

Fresh cache · cq:JAV · last scan 28m ago
+

Next Maintenance

Large tools stay out of the page until you open them.

+
+ +
+
+

Chosen for the big surfaces: Duplicate Review, Cache & Scans, Library Issues, and Bulk ID Check.

+
+ +
+

3. Boundary Without A Mode Switcher

Console, Settings, and Support stay distinct through sidebar groups, not a second top-level mode control.

chosen boundary
+
+
rclone-jav
+
+ +
+
+
Console

Maintenance

Review the library repeatedly.

+
Settings

Configure

Profiles, rules, overlays, deletion.

+
Support

Troubleshoot

Diagnostics, debug benches, setup.

+
+
+

Settings mode would look quieter

+
+
+
+
+
+

Chosen conceptually, simplified visually: sidebar groups do the separating work.

+
+ +
+

4. No Workflow Wizard

The maintenance order is real, but it should be encoded by the Console tools themselves rather than another page.

rejected surface
+
+
Console Order
+
+

Maintenance stays obvious

The sidebar and focused tools make the flow clear without adding a separate wizard surface.

+
+
1
Refresh cache
Update changed files from configured roots.
+
2
Review skipped names
Spot files that did not produce an ID.
+
3
Review duplicates
KEEP reasons and multipart-risk skips included.
+
4
Check library issues
Rename bracket/no-hyphen oddities if needed.
+
+
Utility
+
+
+

Rejected as a dedicated home page. Useful order, unnecessary extra destination.

+
+ +
+

5. Rejected: Bulk Mode Inside Popup

This would turn the popup into a two-mode mini-app with cramped result review.

rejected
+ +

Rejected even for 5-20 IDs. The popup gets a doorway into a focused Bulk Check surface, not a permanent second mode.

+
+ +
+

6. Chosen: Bulk Check Quick Window

Typical batches are expected to be about 5-20 IDs, so the popup opens a compact focused window while the Console owns the full tool.

chosen bulk path
+ +

Chosen: popup opens a compact Bulk Check window for short batches. The Console remains the full batch-review surface.

+
+ +
+

7. Debug Split + Repo Preview

Debug history and standalone tests move out of daily workflow. Sim Dupe leaves the extension UI entirely.

chosen support split
+
+
Support / Debug Tools
+
+
+

Debug Tools

+ +
+
+

Repo Preview Harness

Popup state samples live in a repo HTML file such as samples/popup-states.html, not as a hidden extension page.

+
+
+
MATCH · sample popup state
+
BLK-474 - ClearJAV.mp4 · 4.94 GiB
BLK-474 [1080p].mp4 · 4.90 GiB
+
+
+
+
+

Chosen: standalone support/debug tools remain available; Sim Dupe is removed from extension UI.

+
+ +
+

8. Inline Rule Feedback

Rule editors keep local feedback. Only standalone troubleshooting benches move to Debug Tools.

chosen bench split
+
+
Settings / Matching Rules · feedback stays nearby
+
+
Custom Part Detector | ID Normalizer | Site Extraction
+
+
+

Rule

+ +
+
+
+

Inline Feedback

+
+
KV-118_PART1.mp4 · part 1
+
KV-118_PART2.mp4 · part 2
+
Covered by built-in? · shown
+
+
+
+

Standalone Search Troubleshooting, page extraction testing, and search history can still move into Support. Editing rules should not require leaving the editor to see feedback.

+
+
+

Chosen: contextual inline tests stay. General troubleshooting tools move to Support.

+
+
+
+ + diff --git a/rc-jav.py b/rc-jav.py new file mode 100644 index 0000000..37fc844 --- /dev/null +++ b/rc-jav.py @@ -0,0 +1,2230 @@ +#!/usr/bin/env python3 +"""Scan rclone remotes for duplicate JAV files grouped by ID.""" +from __future__ import annotations + +import argparse +import csv +import fnmatch +import json +import os +import re +import subprocess +import sys +import threading +import time +import xml.etree.ElementTree as ET +from dataclasses import dataclass, asdict +from datetime import datetime +from pathlib import Path +from typing import Iterable + +from rich.console import Console +from rich.panel import Panel +from rich.progress import ( + BarColumn, + MofNCompleteColumn, + Progress, + SpinnerColumn, + TextColumn, + TimeElapsedColumn, + TimeRemainingColumn, +) +from rich.table import Table +from rich.text import Text + +PRIMARY_ID_RE = re.compile(r"^([A-Za-z]+)-(\d+)") +FALLBACK_ID_RE = re.compile(r"^([A-Za-z0-9]+)-(\d+)") +COMPOUND_ID_RE = re.compile(r"^([A-Za-z0-9]+(?:-[A-Za-z0-9]+)+)-(\d+)") + +# Part-suffix patterns: anchored at end of stem (after stripping extension). +# Each pattern's group(1) is the part number. +RANGE_RE = re.compile(r"\[(\d+)-(\d+)\]") + +# Non-anchored XofY probe used in detect_part() to resolve the priority conflict +# between a trailing (N) copy-marker suffix and an embedded XofY part indicator. +# Example: "ENKI-031 [1080p].2of2 (1)" — the (1) is a filesystem collision suffix +# (rclone, Windows copy), not a part number; the 2of2 is the real part indicator. +# This pattern intentionally has no end-anchor so it matches anywhere in the stem. +_XOFY_PRIORITY_RE = re.compile(r"[._ -](\d+)\s*of\s*\d+", re.IGNORECASE) + +BUILTIN_PART_RES = [ + re.compile(r"[-_ ](?:pt|part|cd|disc)[-_ ]?(\d+)$", re.IGNORECASE), + re.compile(r"\s*\((\d+)(?:\s*of\s*\d+)?\)$", re.IGNORECASE), + # Exported multipart filenames often end in `.1of2` / `-2 of 4`. 
+ re.compile(r"[._ -](\d+)\s*of\s*\d+$", re.IGNORECASE), + # Bare numeric suffixes (`_N`, ` N`) are only treated as part numbers when + # the number is 1-2 digits. Wider patterns falsely matched resolution tags + # (`_2160`, `_4K2160`) and dates/years (`SSIS-001 2023.mp4` -> `#part2023`), + # corrupting cache keys. + # Staged detection also retries after resolution/actress cleanup, so end + # anchors can match both raw suffixes and metadata-blocked suffixes safely. + re.compile(r"_(\d{1,2})$"), + # Hyphen short-part suffix after the ID, e.g. OFJE-195-1 [480p].mp4. + # Limit to 1-2 digits so the base ID's usual 3+ digit numeric component + # does not make every canonical `ABC-123` filename look multipart. + re.compile(r"-(\d{1,2})$"), + # Lettered parts: separator (hyphen or underscore) followed by A-D. + # Uppercase only — lowercase letters are variant designators (e.g. IBW-902z) + # and are preserved as part of the base ID, not treated as part numbers. + re.compile(r"[-_]([A-D])$"), + # Bare uppercase letter directly after the ID digits with no separator, + # e.g. BAK-052A, BAK-052B. Lookbehind ensures a digit precedes. + re.compile(r"(?<=\d)([A-D])$"), + re.compile(r"\s+(\d{1,2})$"), +] +PART_RES = list(BUILTIN_PART_RES) + + +def configure_part_patterns(patterns: Iterable[str]) -> list[str]: + """Extend part suffix detection with user regexes whose first group is part number.""" + global PART_RES + PART_RES = list(BUILTIN_PART_RES) + errors: list[str] = [] + for pattern in patterns: + pattern = str(pattern or "").strip() + if not pattern: + continue + try: + compiled = re.compile(pattern, re.IGNORECASE) + except re.error as e: + errors.append(f"{pattern!r}: {e}") + continue + if compiled.groups < 1: + errors.append(f"{pattern!r}: needs a capture group for the part number") + continue + PART_RES.append(compiled) + return errors + + +def detect_part(stem: str) -> str | None: + """Return part number as string if stem ends with a part suffix, else None. 
+ + XofY (e.g. .2of2) anywhere in the stem takes unconditional priority over a + trailing (N) suffix. A file named 'ENKI-031 [1080p].2of2 (1).mp4' is part 2; + the trailing (1) is a filesystem copy-collision marker (rclone / Windows), + not a part number. Without this pre-check the ordered PART_RES list would + match (1) first and misclassify the file as part 1. + """ + m = _XOFY_PRIORITY_RE.search(stem) + if m: + return m.group(1) + for r in PART_RES: + m = r.search(stem) + if m: + return m.group(1) + return None + + +def part_key(part: str) -> str: + token = str(part or "").strip() + if token.isdigit(): + return str(int(token)) + if len(token) == 1 and token.isalpha(): + return str(ord(token.upper()) - ord("A") + 1) + return token.upper() + + +@dataclass +class FileEntry: + source: str # "Source" (priority) or "Target" + remote: str # the rclone remote:path root supplied + path: str # relative path within remote + size: int + mod_time: str + jav_id: str # normalized, e.g. "SSIS-1" + + @property + def full_path(self) -> str: + sep = "" if self.remote.endswith("/") or not self.path else "/" + return f"{self.remote}{sep}{self.path}" + + + +def human_size(n: int) -> str: + nf = float(max(0, n)) + for unit in ("B", "KiB", "MiB", "GiB", "TiB"): + if nf < 1024: + return f"{int(nf)} B" if unit == "B" else f"{nf:.2f} {unit}" + nf /= 1024 + return f"{nf:.2f} PiB" + + +# Matches a trailing lowercase letter variant designator, e.g. the 'z' in IBW-902z. +_VARIANT_SUFFIX_RE = re.compile(r"^(.+?)([a-z])$") + +# Strips `[resolution]` and ` - Actress Name` from a stem so that part-suffix +# patterns anchored at `$` fire correctly. 
# Canonical naming: {ID}[-{part}][ - {actress}][ [{resolution}]]
_RESOLUTION_TAG_RE = re.compile(r"\s*\[[^\]]*\]\s*$")

# Bracket-wrapped ID: [REAL-779] or [HODV-21076] Saki Hatsumi [1080p]
_BRACKET_ID_RE = re.compile(r"^\[([^\]]+)\]")
_RES_LABEL_RE = re.compile(r"\[(?:2160|1080|720|480)p\]", re.IGNORECASE)
_VIDEO_EXTS = {
    ".avi", ".flv", ".m2ts", ".m4v", ".mkv", ".mov", ".mp4", ".mpeg",
    ".mpg", ".ts", ".webm", ".wmv",
}
_LOWEST_KEEP_PRIORITY_EXTS = {".ts"}

# No-hyphen ID fallback: MVSD312 → MVSD-312 (letters-only prefix + digits, no hyphen)
_NOHYPHEN_ID_RE = re.compile(r"^([A-Za-z]{2,8})(\d{3,6})")


def _clean_stem_for_parts(stem: str) -> str:
    """Return stem with trailing [tag] and ' - Actress' stripped.

    Resolution is always the last bracketed token; actress follows ' - '.
    """
    s = _RESOLUTION_TAG_RE.sub("", stem).strip()
    if " - " in s:
        s = s[:s.index(" - ")].strip()
    return s


def _part_detection_stems(stem: str) -> list[str]:
    """Return stem stages for part detection from least to most cleaned.

    Stages are the raw stem, the stem without its trailing [tag], and the
    stem without tag and ' - Actress'. Empty and duplicate stages are dropped.
    """
    resolution_clean = _RESOLUTION_TAG_RE.sub("", stem).strip()
    actress_clean = _clean_stem_for_parts(stem)
    out: list[str] = []
    for candidate in (stem, resolution_clean, actress_clean):
        if candidate and candidate not in out:
            out.append(candidate)
    return out


def detect_part_from_stem(stem: str, skip: str | None = None) -> str | None:
    """Try part suffix rules before and after metadata cleanup.

    Args:
        stem: Filename stem (extension already removed).
        skip: Optional candidate text to ignore. extract_id() passes the bare
            ID here so that a stage consisting of nothing but the ID is never
            probed for a part suffix.

    Returns:
        The raw part token from the first stage that matches, else None.
    """
    for candidate in _part_detection_stems(stem):
        if skip is not None and candidate == skip:
            continue
        part = detect_part(candidate)
        if part:
            return part
    return None


def extract_id(name: str) -> str | None:
    """Return the canonical ID for a filename, or None when no ID is found.

    The numeric component is zero-padded to at least three digits and the
    prefix is upper-cased (``abc-27.mp4`` -> ``ABC-027``). A lowercase letter
    directly after the digits is kept as a variant designator. A detected part
    suffix is appended as ``#part<N>``.
    """
    stem = Path(name).stem

    # Strip bracket wrapper: [REAL-779] → REAL-779, [SCOP-297] [1080p] → SCOP-297
    effective_stem = stem
    if stem.startswith("["):
        bm = _BRACKET_ID_RE.match(stem)
        if bm:
            effective_stem = bm.group(1).strip()

    # Same priority order as before: primary, compound, fallback, no-hyphen.
    m = (
        PRIMARY_ID_RE.match(effective_stem)
        or COMPOUND_ID_RE.match(effective_stem)
        or FALLBACK_ID_RE.match(effective_stem)
        # No-hyphen fallback: MVSD312 → MVSD-312
        or _NOHYPHEN_ID_RE.match(effective_stem)
    )
    if not m:
        return None

    num = int(m.group(2))
    width = max(3, len(m.group(2)))
    prefix = m.group(1).upper()
    if prefix == "FC2":
        prefix = "FC2-PPV"

    # Check the character immediately after the matched digits.
    # Lowercase → variant designator (e.g. IBW-902z): fold into the base ID.
    # Uppercase A-D → part letter: handled below by detect_part.
    # Anything else (space, '[', end-of-string) → no variant.
    after = effective_stem[m.end():m.end() + 1]
    variant = after if after.islower() else ""

    base = f"{prefix}-{num:0{width}d}{variant}"

    # Text of the ID exactly as it appears in the filename. A detection stage
    # equal to this has nothing after the ID, so any "part" found there would
    # be the ID's own digits: without this guard "ABC-27" matched the
    # hyphen short-part rule and became "ABC-027#part27".
    bare_id = effective_stem[:m.end() + len(variant)]

    # Use original stem (not effective_stem) so bracket-wrapped filenames like
    # [REAL-779-1].mp4 still get part detection applied to the full stem.
    # Run before and after metadata cleanup: raw suffixes such as
    # "KV-118 - Actress_PART1" must survive, while trailing [1080p] tags still
    # need cleanup before end-anchored detectors can match.
    part = detect_part_from_stem(stem, skip=bare_id)
    return f"{base}#part{part_key(part)}" if part else base


def normalize_id(raw: str) -> str | None:
    """Normalize a user-supplied ID string through extract_id()."""
    return extract_id(raw + ".x")  # add dummy ext so stem keeps the ID intact


def describe_id_match(display_query: str, matched_query: str, matched_id: str,
                      expansion_count: int) -> dict[str, str]:
    """Explain the matcher path used for one ID hit in JSON output.

    Args:
        display_query: The query as the user typed it.
        matched_query: The (possibly normalized/expanded) query that matched.
        matched_id: The ID that was hit.
        expansion_count: How many queries the user's input expanded into.

    Returns:
        A dict with match_kind, match_reason, match_confidence, matched_query
        and matched_id.
    """
    if "*" in matched_query or "?" in matched_query:
        kind, label, confidence = "wildcard", "Wildcard ID", "broad"
    elif expansion_count > 1:
        kind, label, confidence = "range", "Range member", "expanded"
    elif "#part" in matched_query:
        kind, label, confidence = "exact_part", "Exact part ID", "high"
    elif matched_id.startswith(matched_query + "#part"):
        kind, label, confidence = "part", "Base ID + part", "related"
    elif display_query.upper() != matched_query.upper():
        kind, label, confidence = "normalized", "Normalized ID", "normalized"
    else:
        kind, label, confidence = "exact", "Exact ID", "high"
    return {
        "match_kind": kind,
        "match_reason": label,
        "match_confidence": confidence,
        "matched_query": matched_query,
        "matched_id": matched_id,
    }


def expand_range(raw: str) -> list[str] | None:
    """Expand a bracket range like 'IPZZ-[820-860]' into individual ID strings.

    Returns None if no range marker present. A reversed range is expanded in
    ascending order.
    """
    m = RANGE_RE.search(raw)
    if not m:
        return None
    a, b = int(m.group(1)), int(m.group(2))
    lo, hi = (a, b) if a <= b else (b, a)
    width = max(len(m.group(1)), len(m.group(2)))  # preserve zero-padding
    return [raw[:m.start()] + f"{n:0{width}d}" + raw[m.end():] for n in range(lo, hi + 1)]


RCLONE_BIN = "rclone"
BASIC = False  # set by --basic
USE_ANSI = True  # disabled by --no-color

# Pre-rich ANSI codes (used in --basic mode for color).
ANSI_RESET = "\033[0m"
ANSI_GREEN = "\033[32m"
ANSI_RED = "\033[31m"
ANSI_YELLOW = "\033[33m"
ANSI_CYAN = "\033[36m"
ANSI_DIM = "\033[2m"
ANSI_BOLD = "\033[1m"


def ansi(s: str, code: str) -> str:
    """Wrap ``s`` in ``code`` plus a reset, or return it as-is when USE_ANSI is off."""
    if not USE_ANSI:
        return s
    return f"{code}{s}{ANSI_RESET}"


console = Console()  # replaced in main() if --no-color


_RICH_TAG_RE = re.compile(r"\[/?[^\]]*\]")


def strip_markup(s: str) -> str:
    """Drop every square-bracketed tag matched by _RICH_TAG_RE from ``s``."""
    return re.sub(_RICH_TAG_RE, "", s)


class BasicProgress:
    """Plain-stderr stand-in for rich.Progress, used when --basic is set."""

    def __init__(self):
        # Per-task state keyed by task id: description, total, done counter.
        self._tasks: dict[int, dict] = {}
        self._next = 0
        # "done" value at the time each task last produced output.
        self._last_print: dict[int, int] = {}

    def __enter__(self):
        return self

    def __exit__(self, *exc):
        # One summary line per task; exceptions are never suppressed.
        for task in self._tasks.values():
            tag = ansi("[done]", ANSI_GREEN)
            sys.stderr.write(f"{tag} {task['desc']} {task['done']}/{task['total']}\n")
        return False

    def add_task(self, description: str, total: int = 1) -> int:
        """Register a task, announce it on stderr and return its id."""
        tid = self._next
        self._next = tid + 1
        plain = strip_markup(description)
        self._tasks[tid] = {"desc": plain, "total": total, "done": 0}
        self._last_print[tid] = 0
        sys.stderr.write(f"{ansi('[start]', ANSI_CYAN)} {plain}\n")
        return tid

    def update(self, tid, total=None, description=None, **_):
        """Change a task's total and/or description; other keywords are ignored."""
        task = self._tasks[tid]
        if total is not None:
            task["total"] = total
        if description is not None:
            task["desc"] = strip_markup(description)

    def advance(self, tid, n: int = 1):
        """Add ``n`` to the task's counter and refresh its stderr line if due."""
        task = self._tasks[tid]
        task["done"] += n
        done, total = task["done"], task["total"]
        finished = done == total
        # In-place refresh every 5 files (or every file if total small).
        step = 1 if total <= 50 else 5
        if not finished and done - self._last_print[tid] < step:
            return
        counter = ansi(f"{done}/{total}", ANSI_CYAN)
        line = f" {counter} {ansi(task['desc'], ANSI_DIM)}"
        if sys.stderr.isatty():
            sys.stderr.write(f"\r\033[K{line}")
            if finished:
                sys.stderr.write("\n")
            sys.stderr.flush()
        elif finished:
            # Non-TTY: only print final line, skip intermediate noise.
            sys.stderr.write(line + "\n")
        self._last_print[tid] = done


# Default remotes used when --search is invoked without explicit --source/--target.
DEFAULT_SOURCE = ["cq:personal-files/ClearJAV"]
DEFAULT_TARGET = ["cq:personal-files/JAV/TMP"]

# Default WinCatalog export folder (or specific files). Folder entries auto-discover *.csv / *.xml.
DEFAULT_CATALOG: list[str] = [str(Path(__file__).resolve().parent / "wincatalog")]

# CSV column synonyms (lowercased) — first matching one wins.
CATALOG_COL_NAME = ("name", "file name", "filename", "title")
CATALOG_COL_PATH = ("path", "full path", "location", "folder")
CATALOG_COL_SIZE = ("size", "file size", "bytes", "size (bytes)")
CATALOG_COL_DISC = ("disc", "disc name", "disc label", "volume", "source", "catalog", "media")

CACHE_PATH = Path(__file__).resolve().parent / "cache.json"
CACHE_VERSION = 3  # bumped: extract_id handles bracket-wrapped IDs + no-hyphen fallback
CACHE_STALE_HOURS = 24

DEFAULT_KEEP_RANKING: dict = {
    "priority_folders": ["ClearJAV"],
    "size_tolerance_mib": 0,
    "format_preference": ["mkv", "mp4", "wmv", "avi"],
    "tiebreak_res_tag": True,
    "tiebreak_longer_name": True,
}
# Module-level ranking config; set from config.json in main() so all call sites pick it up.
_KEEP_RANKING: dict = {}

CONFIG_PATH = Path(__file__).resolve().parent / "config.json"

# Written by the native-messaging host when the user clicks Cancel in the
# extension popup. walk_remote checks for it every CANCEL_CHECK_INTERVAL files
# and exits cleanly if found.
CANCEL_FLAG = Path(__file__).resolve().parent / "scan-cancel.flag"
CANCEL_CHECK_INTERVAL = 100  # check / emit progress every N files


def load_config() -> dict:
    """Read config.json and return its top-level object.

    Returns an empty dict when the file is missing, unreadable, not valid
    JSON, not decodable, or when its top level is not an object.
    """
    if not CONFIG_PATH.exists():
        return {}
    try:
        # utf-8-sig: config.json is user-editable and Windows editors may save
        # it with a BOM, which json.loads() rejects after a plain utf-8 decode.
        data = json.loads(CONFIG_PATH.read_text(encoding="utf-8-sig"))
    except (ValueError, OSError):
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        return {}
    return data if isinstance(data, dict) else {}


def save_config(cfg: dict) -> None:
    """Write ``cfg`` to config.json via a sibling .tmp file and os.replace()."""
    tmp = CONFIG_PATH.with_suffix(CONFIG_PATH.suffix + ".tmp")
    tmp.write_text(json.dumps(cfg, indent=2), encoding="utf-8")
    os.replace(tmp, CONFIG_PATH)


def load_cache() -> dict:
    """Read cache.json, falling back to an empty cache when it is unusable.

    The empty cache is returned when the file is missing, unreadable, not
    valid JSON, not decodable, of a different CACHE_VERSION, or lacks a dict
    under "remotes". A version mismatch is reported on stderr.
    """
    empty = {"version": CACHE_VERSION, "remotes": {}}
    if not CACHE_PATH.exists():
        return empty
    try:
        data = json.loads(CACHE_PATH.read_text(encoding="utf-8"))
    except (ValueError, OSError):
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        return empty
    if (
        not isinstance(data, dict)
        or data.get("version") != CACHE_VERSION
        or not isinstance(data.get("remotes"), dict)
    ):
        if isinstance(data, dict) and "version" in data and data["version"] != CACHE_VERSION:
            sys.stderr.write(
                f"[warn] cache version mismatch (got {data['version']}, "
                f"expected {CACHE_VERSION}); forcing full rescan.\n"
            )
        return empty
    return data


def save_cache(cache: dict) -> None:
    """Write ``cache`` to cache.json, retrying the final rename once."""
    # Write to a sibling tmp file then atomically replace, so a killed mid-write
    # (Ctrl-C, power loss, concurrent --scan) can't leave a half-written
    # cache.json — load_cache would otherwise see invalid JSON and fall back to
    # an empty cache, forcing a full re-scan.
    tmp = CACHE_PATH.with_suffix(CACHE_PATH.suffix + ".tmp")
    tmp.write_text(json.dumps(cache, indent=2), encoding="utf-8")
    try:
        os.replace(tmp, CACHE_PATH)
    except PermissionError:
        # Windows: destination may be briefly locked by antivirus or a concurrent reader.
        time.sleep(0.5)
        os.replace(tmp, CACHE_PATH)


def cache_age_hours(scanned_at: str) -> float | None:
    """Return hours elapsed since the ISO timestamp ``scanned_at``.

    A trailing "Z" is accepted as UTC. Returns None when the value is not a
    string or cannot be parsed.
    """
    try:
        dt = datetime.fromisoformat(scanned_at.replace("Z", "+00:00"))
    except (AttributeError, TypeError, ValueError):
        # Missing/non-string values are treated like an unparseable timestamp.
        return None
    # Compare aware with aware and naive with naive.
    now = datetime.now(dt.tzinfo) if dt.tzinfo else datetime.now()
    return (now - dt).total_seconds() / 3600.0


def fmt_age(hours: float) -> str:
    """Format an age in hours as minutes ("30m"), hours ("2.0h") or days ("2.0d")."""
    if hours < 1:
        return f"{int(hours * 60)}m"
    if hours < 24:
        return f"{hours:.1f}h"
    return f"{hours / 24:.1f}d"


# ---------- WinCatalog ingest ----------

def _pick_col(headers_lower: list[str], synonyms: tuple[str, ...]) -> str | None:
    """Return the first synonym present in ``headers_lower``, else None."""
    for s in synonyms:
        if s in headers_lower:
            return s
    return None


def normalize_catalog_path(path: str) -> str:
    """Keep catalog paths display-compatible with rclone-style path consumers.

    Backslashes become forward slashes and repeated slashes are collapsed,
    except for a leading "//", which is preserved.
    """
    p = (path or "").replace("\\", "/")
    if p.startswith("//"):
        return "//" + re.sub(r"/+", "/", p[2:])
    return re.sub(r"/+", "/", p)


def load_catalog_csv(path: Path, skipped: list[tuple[str, str]]) -> list[FileEntry]:
    """Load a WinCatalog CSV export. Lenient about column names."""
    entries: list[FileEntry] = []
    with path.open("r", encoding="utf-8-sig", newline="") as f:
        # Sniff delimiter
        sample = f.read(4096)
        f.seek(0)
        try:
            dialect = csv.Sniffer().sniff(sample, delimiters=",;\t|")
        except csv.Error:
            dialect = csv.excel
        reader = csv.DictReader(f, dialect=dialect)
        if not reader.fieldnames:
            return entries
        headers: dict[str, str] = {}
        for h in reader.fieldnames:
            hl = h.lower()
            if hl not in headers:
                headers[hl] = h
        col_name = _pick_col(list(headers), CATALOG_COL_NAME)
        col_path = _pick_col(list(headers), CATALOG_COL_PATH)
        col_size = _pick_col(list(headers), CATALOG_COL_SIZE)
        col_disc = _pick_col(list(headers), CATALOG_COL_DISC)
        if not col_name and not col_path:
            console.print(f"[yellow]WARN: catalog CSV {path} has no Name/Path columns; skipping.[/]")
            return entries
        for row in reader:
            name = (row.get(headers[col_name]) if col_name else "") or ""
            full_path = (row.get(headers[col_path]) if col_path else "") or ""
            if not name and full_path:
                name = Path(full_path).name
            full_path = normalize_catalog_path(full_path)
            if not name:
                continue
            jav_id = extract_id(name)
            if not jav_id:
                skipped.append((f"catalog:{path.name}", full_path or name))
                continue
            try:
                size = int(row.get(headers[col_size], 0)) if col_size else 0
            except (ValueError, TypeError):
                size = 0
            disc = (row.get(headers[col_disc]) if col_disc else "") or ""
            # Encode disc label into "remote" so it surfaces in output.
+ remote_label = f"catalog:{disc}" if disc else f"catalog:{path.name}" + entries.append(FileEntry( + source="Catalog", remote=remote_label, + path=full_path or name, size=size, mod_time="", + jav_id=jav_id, + )) + return entries + + +def _strip_xml_ns(tag: str) -> str: + """Remove Clark-notation namespace {uri}local → local.""" + return tag.split("}")[-1] if "}" in tag else tag + + +def load_catalog_xml(path: Path, skipped: list[tuple[str, str]]) -> list[FileEntry]: + """Load a WinCatalog XML export. Walks for any element with file-like attrs.""" + entries: list[FileEntry] = [] + tree = ET.parse(str(path)) + root = tree.getroot() + + def walk(node, disc_label: str, parent_path: str, _depth: int = 0): + if _depth > 500: + return + tag = _strip_xml_ns(node.tag).lower() + # Heuristics: disc/catalog/source containers reset disc_label + if tag in ("disc", "catalog", "source", "volume", "media"): + disc_label = node.get("name") or node.get("Name") or disc_label + # File-like nodes + if tag in ("file", "f"): + name = node.get("name") or node.get("Name") or node.findtext("Name") or "" + size_raw = node.get("size") or node.get("Size") or node.findtext("Size") or "0" + try: + size = int(size_raw) + except ValueError: + size = 0 + full_path = normalize_catalog_path(f"{parent_path}/{name}" if parent_path else name) + jav_id = extract_id(name) + if jav_id: + entries.append(FileEntry( + source="Catalog", + remote=f"catalog:{disc_label}" if disc_label else f"catalog:{path.name}", + path=full_path, size=size, mod_time="", jav_id=jav_id, + )) + else: + skipped.append((f"catalog:{disc_label or path.name}", full_path)) + return + # Folder-like: extend parent_path + if tag in ("folder", "dir", "directory"): + folder_name = node.get("name") or node.get("Name") or "" + parent_path = normalize_catalog_path(f"{parent_path}/{folder_name}" if parent_path else folder_name) + for child in node: + walk(child, disc_label, parent_path, _depth + 1) + + walk(root, "", "") + return entries + + +def 
_expand_catalog_paths(paths: list[str]) -> list[Path]: + """Expand any directories to their *.csv / *.xml children. Files passed through.""" + out: list[Path] = [] + for p in paths: + cp = Path(p) + if cp.is_dir(): + for child in sorted(cp.iterdir()): + if child.suffix.lower() in (".csv", ".xml") and child.is_file(): + out.append(child) + elif cp.exists(): + out.append(cp) + # silently skip missing default dir; warn for everything else + elif Path(p).resolve() not in {Path(d).resolve() for d in DEFAULT_CATALOG}: + console.print(f"[yellow]WARN: catalog path not found: {p}[/]") + return out + + +def load_catalogs(paths: list[str], skipped: list[tuple[str, str]]) -> list[FileEntry]: + out: list[FileEntry] = [] + for cp in _expand_catalog_paths(paths): + ext = cp.suffix.lower() + if ext == ".csv": + out.extend(load_catalog_csv(cp, skipped)) + elif ext == ".xml": + out.extend(load_catalog_xml(cp, skipped)) + else: + console.print(f"[yellow]WARN: unknown catalog format '{ext}' for {cp}; skipping.[/]") + return out + + +# ---------- quick search (no cache) ---------- + +def quick_search_remote(remote: str, source_label: str, + patterns: list[str], + skipped: list[tuple[str, str]]) -> list[FileEntry]: + """Run `rclone lsjson --include ` once per pattern. 
Bypass cache.""" + out: list[FileEntry] = [] + seen: set[tuple[str, str]] = set() + for pat in patterns: + cmd = [RCLONE_BIN, "lsjson", remote, "--files-only", "-R", "--include", pat] + proc = subprocess.run(cmd, capture_output=True, text=True, + encoding="utf-8", errors="replace") + if proc.returncode != 0: + console.print(f"[red]rclone lsjson --include failed for {remote}:[/]\n{proc.stderr}") + sys.exit(proc.returncode) + for item in json.loads(proc.stdout or "[]"): + if item.get("IsDir"): + continue + path = item["Path"] + key = (remote, path) + if key in seen: + continue + seen.add(key) + jav_id = extract_id(Path(path).name) + if not jav_id: + skipped.append((remote, path)) + continue + out.append(FileEntry( + source=source_label, remote=remote, path=path, + size=int(item.get("Size", 0)), + mod_time=item.get("ModTime", ""), jav_id=jav_id, + )) + return out + + +def choose_search_mode(raw_queries: list[str], force_quick: bool, force_cache: bool) -> tuple[str, str]: + """Decide quick vs cached. Returns (mode, reason).""" + if force_quick and force_cache: + return ("cached", "both --quick and --cache passed; preferring --cache (safer)") + if force_quick: + return ("quick", "forced via --quick") + if force_cache: + return ("cached", "forced via --cache") + if len(raw_queries) > 1: + return ("cached", f"multi-query ({len(raw_queries)} IDs) — cache batches them for free") + if not raw_queries: + return ("cached", "no queries") + q = raw_queries[0] + if RANGE_RE.search(q): + return ("cached", "range [N-M] — too many rclone calls otherwise") + if "*" in q or "?" in q: + return ("cached", "wildcard — cache match semantics are more reliable") + return ("quick", "single exact ID — live lookup is fastest") + + +def _escape_rclone_glob(s: str) -> str: + """Escape rclone filter meta-chars so a literal token isn't interpreted as a + glob. 
rclone's filter syntax treats `*`, `?`, `[`, `{` specially; brackets + open a char-class that fails silently if the token contains `[` or `]`.""" + out = [] + for ch in s: + if ch in r"*?[]{}\\": + out.append("\\" + ch) + else: + out.append(ch) + return "".join(out) + + +def name_to_include_patterns(tokens: list[str]) -> list[str]: + """Build rclone --include globs for each name token (case-insensitive substring).""" + pats: list[str] = [] + for t in tokens: + if "*" in t or "?" in t: + # Caller-supplied wildcard — assume they meant it. + pats.append(t) + else: + # Literal substring search: escape glob meta inside the token so + # `--name "[BD]"` searches for the literal "[BD]" not a char class. + pats.append(f"*{_escape_rclone_glob(t)}*") + return pats + + +def name_match(stem: str, tokens: list[str]) -> bool: + """Case-insensitive: True if ANY token matches stem (substring or fnmatch glob).""" + low = stem.lower() + for t in tokens: + tl = t.lower() + if "*" in tl or "?" in tl: + if fnmatch.fnmatchcase(low, tl): + return True + elif tl in low: + return True + return False + + +def query_to_include_patterns(raw: str) -> list[str]: + """Turn a search query into one or more rclone --include globs. + Ranges expand to individual IDs; wildcards and exact IDs map to single glob.""" + if RANGE_RE.search(raw): + expanded = expand_range(raw) or [] + out: list[str] = [] + for e in expanded: + out.extend(query_to_include_patterns(e)) + return out + if "*" in raw or "?" 
in raw: + return [f"{raw}*"] + norm = normalize_id(raw) + if not norm: + return [f"{raw}*"] + prefix, _, digits = norm.rpartition("-") + if not digits.isdigit(): + return [f"{norm}*"] + n = int(digits) + width = max(3, len(str(n))) + return [f"{prefix}-{n:0{width}d}*"] + + +# ---------- rclone wrappers ---------- + +def remote_file_count(remote: str) -> int: + """Fast total file count via `rclone size --json`.""" + cmd = [RCLONE_BIN, "size", "--json", remote] + proc = subprocess.run(cmd, capture_output=True, text=True, + encoding="utf-8", errors="replace") + if proc.returncode != 0: + console.print(f"[red]rclone size failed for {remote}:[/]\n{proc.stderr}") + sys.exit(proc.returncode) + try: + return int(json.loads(proc.stdout).get("count", 0)) + except (json.JSONDecodeError, ValueError): + return 0 + + +DURATION_RE = re.compile(r"^\s*(\d+)\s*([smhd])\s*$", re.IGNORECASE) + + +def parse_duration(s: str) -> str | None: + """Validate a duration suffix (`30m`, `24h`, `7d`, `90s`). Returns the + normalized form rclone accepts, or None if invalid. We don't compute a + timedelta — we pass the suffix straight to rclone --max-age.""" + if not s: + return None + m = DURATION_RE.match(s) + if not m: + return None + return f"{m.group(1)}{m.group(2).lower()}" + + +def walk_remote(remote: str, source_label: str, + skipped: list[tuple[str, str]], + progress: Progress, task_id, + max_age: str | None = None, + _total_override: int | None = None) -> tuple[list[FileEntry], list[str]]: + """Stream files from rclone lsf, ticking progress per file. + If max_age is set, pass --max-age to rclone so only recently-modified files + are returned (incremental scan). + _total_override: skip the internal remote_file_count probe (caller already did it).""" + if max_age: + # Can't pre-count for an age-filtered walk — skip the size probe and + # let progress run on a synthetic total. 
+ total = 0 + progress.update(task_id, total=1, + description=f"[cyan]{source_label}[/] {remote} (since {max_age})") + else: + if _total_override is not None: + total = _total_override + else: + total = remote_file_count(remote) + if BASIC: + # Caller already emitted SCAN_REMOTE_START (without total) — now we know it. + sys.stderr.write("SCAN_REMOTE_COUNTED " + json.dumps({ + "remote": remote, "total": total, + }) + "\n") + sys.stderr.flush() + progress.update(task_id, total=max(total, 1), + description=f"[cyan]{source_label}[/] {remote}") + cmd = [RCLONE_BIN, "lsf", "--files-only", "-R", + "--format", "pst", "--separator", "\t"] + if max_age: + cmd += ["--max-age", max_age] + cmd.append(remote) + proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, + text=True, encoding="utf-8", errors="replace") + entries: list[FileEntry] = [] + local_skipped: list[str] = [] + if proc.stdout is None: + raise RuntimeError("rclone stdout pipe unexpectedly None") + _stderr_chunks: list[str] = [] + _stderr_thread = threading.Thread( + target=lambda: _stderr_chunks.append(proc.stderr.read() if proc.stderr else ""), + daemon=True, + ) + _stderr_thread.start() + _cancelled = False + try: + for line in proc.stdout: + line = line.rstrip("\n").rstrip("\r") + if not line: + continue + parts = line.split("\t") + if len(parts) < 2: + continue + rel = parts[0] + try: + size = int(parts[1]) + except ValueError: + size = 0 + mod_time = parts[2] if len(parts) >= 3 else "" + jav_id = extract_id(Path(rel).name) + if not jav_id: + local_skipped.append(rel) + skipped.append((remote, rel)) + else: + entries.append(FileEntry( + source=source_label, remote=remote, path=rel, + size=size, mod_time=mod_time, jav_id=jav_id, + )) + progress.advance(task_id) + # Every CANCEL_CHECK_INTERVAL files: check cancel flag and emit progress. 
+ n = len(entries) + len(local_skipped) + if BASIC and n > 0 and n % CANCEL_CHECK_INTERVAL == 0: + if CANCEL_FLAG.exists(): + try: + CANCEL_FLAG.unlink(missing_ok=True) + except OSError: + pass + proc.terminate() + try: + proc.wait(timeout=3) + except subprocess.TimeoutExpired: + proc.kill() + _cancelled = True + break + sys.stderr.write("SCAN_FILE_PROGRESS " + json.dumps({ + "remote": remote, "label": source_label, + "files": len(entries), "skipped": len(local_skipped), + "total": total, + }) + "\n") + sys.stderr.flush() + except KeyboardInterrupt: + proc.terminate() + try: + proc.wait(timeout=3) + except subprocess.TimeoutExpired: + proc.kill() + raise + if _cancelled: + sys.stderr.write("SCAN_CANCELLED\n") + sys.stderr.flush() + sys.exit(0) + proc.wait() + _stderr_thread.join() + if proc.returncode != 0: + err = _stderr_chunks[0] if _stderr_chunks else "" + console.print(f"[red]rclone lsf failed for {remote}:[/]\n{err}") + sys.exit(proc.returncode) + return entries, local_skipped + + +def make_progress(): + if BASIC: + return BasicProgress() + return Progress( + SpinnerColumn(), + TextColumn("{task.description}"), + BarColumn(), + MofNCompleteColumn(), + TimeElapsedColumn(), + TextColumn("eta"), + TimeRemainingColumn(), + console=console, + transient=False, + ) + + +# ---------- collectors ---------- + +def collect_with_progress(remotes_by_label: list[tuple[str, str]], + skipped: list[tuple[str, str]] + ) -> list[FileEntry]: + """Dupe-mode collect — every remote freshly walked with progress.""" + out: list[FileEntry] = [] + if not remotes_by_label: + return out + with make_progress() as progress: + tasks = {(label, r): progress.add_task(f"{label} {r}", total=1) + for label, r in remotes_by_label} + for (label, r), tid in tasks.items(): + entries, _ = walk_remote(r, label, skipped, progress, tid) + out.extend(entries) + return out + + +def cached_collect(remotes: list[str], source_label: str, + skipped: list[tuple[str, str]], + cache: dict, use_cache: bool, 
force_update: bool, + cache_meta: dict[str, dict], + scan_since: str | None = None) -> list[FileEntry]: + """Search-mode collect with cache. Always recursive. + scan_since: rclone duration string (`24h`, `7d`). When set during a forced + update, only files modified within the window are walked and merged on top + of the existing cache entry; files older than the window keep their cached + record. If there's no prior cache entry for a remote, falls through to a + full scan.""" + out: list[FileEntry] = [] + to_scan: list[str] = [] + to_incremental: list[tuple[str, dict]] = [] # (remote, existing_entry) + for r in remotes: + if scan_since and force_update and use_cache: + existing = cache["remotes"].get(r) + if existing: + to_incremental.append((r, existing)) + continue + # No prior cache for this remote -> can't be incremental, fall back. + entry = cache["remotes"].get(r) if use_cache and not force_update else None + if entry: + age = cache_age_hours(entry["scanned_at"]) + age_str = fmt_age(age) if age is not None else "?" + stale = age is not None and age > CACHE_STALE_HOURS + cache_meta[r] = {"cached": True, "age": age_str, "stale": stale, + "file_count": len(entry["files"])} + for f in entry["files"]: + out.append(FileEntry(source=source_label, remote=r, path=f["path"], + size=f["size"], mod_time=f.get("mod_time", ""), + jav_id=f["jav_id"])) + for s in entry.get("skipped", []): + skipped.append((r, s)) + else: + to_scan.append(r) + + if to_scan: + with make_progress() as progress: + tids = {r: progress.add_task(f"{source_label} {r}", total=1) for r in to_scan} + for r_idx, r in enumerate(to_scan): + _total: int | None = None + if BASIC: + # Emit SCAN_REMOTE_START immediately so the UI shows the remote name. + # Then probe the file count; once known, emit SCAN_REMOTE_COUNTED so + # the UI can show "N / total" without waiting for the first 100 files. 
+ sys.stderr.write("SCAN_REMOTE_START " + json.dumps({ + "remote": r, "label": source_label, + "index": r_idx + 1, "of": len(to_scan), + "total": None, + }) + "\n") + sys.stderr.flush() + _total = remote_file_count(r) + sys.stderr.write("SCAN_REMOTE_COUNTED " + json.dumps({ + "remote": r, "total": _total, + }) + "\n") + sys.stderr.flush() + fresh, local_skipped = walk_remote(r, source_label, skipped, progress, tids[r], + _total_override=_total) + out.extend(fresh) + cache_meta[r] = {"cached": False, "age": "fresh", "stale": False, + "file_count": len(fresh)} + if use_cache: + cache["remotes"][r] = { + "scanned_at": datetime.now().astimezone().isoformat(), + "recursive": True, + "files": [{"path": e.path, "size": e.size, "mod_time": e.mod_time, + "jav_id": e.jav_id} for e in fresh], + "skipped": local_skipped, + } + if BASIC: + sys.stderr.write("SCAN_PROGRESS " + json.dumps({ + "remote": r, "label": source_label, + "files": len(fresh), "files_total": len(out), + }) + "\n") + sys.stderr.flush() + + if to_incremental: + with make_progress() as progress: + tids = {r: progress.add_task(f"{source_label} {r} (since {scan_since})", total=1) + for r, _ in to_incremental} + for r_idx, (r, existing) in enumerate(to_incremental): + if BASIC: + sys.stderr.write("SCAN_REMOTE_START " + json.dumps({ + "remote": r, "label": source_label, + "index": r_idx + 1, "of": len(to_incremental), + "total": None, "incremental": True, + }) + "\n") + sys.stderr.flush() + fresh, local_skipped = walk_remote( + r, source_label, skipped, progress, tids[r], max_age=scan_since, + ) + # Merge: replace entries at paths we just walked, keep all others. + new_paths = {e.path for e in fresh} + old_files = [f for f in existing.get("files", []) + if f["path"] not in new_paths] + merged_files = old_files + [ + {"path": e.path, "size": e.size, "mod_time": e.mod_time, + "jav_id": e.jav_id} for e in fresh + ] + # Merge skipped lists (de-dupe). 
+ old_skipped = set(existing.get("skipped", [])) + old_skipped.update(local_skipped) + # Emit FileEntry for everything (old + new) so the caller sees the + # full set, not just deltas. + for f in merged_files: + out.append(FileEntry(source=source_label, remote=r, path=f["path"], + size=f["size"], mod_time=f.get("mod_time", ""), + jav_id=f["jav_id"])) + for s in old_skipped: + skipped.append((r, s)) + cache_meta[r] = { + "cached": False, "age": f"incremental {scan_since}", + "stale": False, "file_count": len(merged_files), + "added_or_updated": len(fresh), + } + if use_cache: + cache["remotes"][r] = { + "scanned_at": datetime.now().astimezone().isoformat(), + "recursive": True, + "files": merged_files, + "skipped": sorted(old_skipped), + } + if BASIC: + sys.stderr.write("SCAN_PROGRESS " + json.dumps({ + "remote": r, "label": source_label, + "files": len(fresh), "files_total": len(out), + "incremental": True, + "file_count": len(merged_files), + }) + "\n") + sys.stderr.flush() + return out + + +# ---------- renderers ---------- + +def render_banner(cache_meta: dict[str, dict], mode: str) -> Panel: + lines: list[Text] = [] + lines.append(Text.from_markup(f"[bold]mode:[/] {mode}")) + if cache_meta: + for r, m in cache_meta.items(): + if m["cached"]: + tag = f"CACHED {m['age']}" + (" STALE" if m["stale"] else "") + style = "yellow" if m["stale"] else "dim" + else: + tag = "FRESH SCAN" + style = "green" + lines.append(Text.from_markup( + f" [white]{r}[/] [{style}]{tag}[/] [dim]({m['file_count']} files)[/]" + )) + body = Text("\n").join(lines) + return Panel(body, title="rc-jav", title_align="left", border_style="blue") + + +def render_search(matches: dict[str, list[FileEntry]], queries: list[str], + cache_meta: dict[str, dict]) -> None: + console.print(render_banner(cache_meta, mode="search")) + for q in queries: + hits = matches.get(q, []) + if not hits: + console.print(f"[bold red][{q}] NOT FOUND[/]") + console.print() + continue + title = f"[bold green][{q}] 
{len(hits)} hit(s)[/]" + tbl = Table(title=title, title_justify="left", show_lines=False, + border_style="green", expand=True) + tbl.add_column("Source", style="yellow", no_wrap=True) + tbl.add_column("Cache", no_wrap=True) + tbl.add_column("File", style="bold", overflow="fold") + tbl.add_column("Size", justify="right", no_wrap=True) + tbl.add_column("Path", style="dim", overflow="fold") + for e in sorted(hits, key=lambda x: (x.jav_id, x.path.lower())): + meta = cache_meta.get(e.remote, {}) + if meta.get("cached"): + cache_tag = "[yellow][CACHED-STALE][/]" if meta.get("stale") else "[dim][CACHED][/]" + else: + cache_tag = "[green][FRESH][/]" + tbl.add_row( + e.source, cache_tag, Path(e.path).name, + f"{human_size(e.size)}\n[dim]({e.size:,} B)[/]", + e.full_path, + ) + console.print(tbl) + console.print() + + +def render_name_matches(hits: list[FileEntry], tokens: list[str], + cache_meta: dict[str, dict]) -> None: + title = f"[bold green]Name match {tokens} — {len(hits)} hit(s)[/]" + if not hits: + console.print(f"[bold red]Name match {tokens} — NOT FOUND[/]") + return + tbl = Table(title=title, title_justify="left", show_lines=False, + border_style="green", expand=True) + tbl.add_column("Source", style="yellow", no_wrap=True) + tbl.add_column("Cache", no_wrap=True) + tbl.add_column("ID", style="bold cyan", no_wrap=True) + tbl.add_column("File", style="bold", overflow="fold") + tbl.add_column("Size", justify="right", no_wrap=True) + tbl.add_column("Path", style="dim", overflow="fold") + for e in sorted(hits, key=lambda x: (x.jav_id, x.path.lower())): + meta = cache_meta.get(e.remote, {}) + if meta.get("cached"): + cache_tag = "[yellow][CACHED-STALE][/]" if meta.get("stale") else "[dim][CACHED][/]" + else: + cache_tag = "[green][FRESH][/]" + tbl.add_row( + e.source, cache_tag, e.jav_id, Path(e.path).name, + f"{human_size(e.size)}\n[dim]({e.size:,} B)[/]", + e.full_path, + ) + console.print(tbl) + console.print() + + +def render_name_matches_plain(hits: 
list[FileEntry], tokens: list[str], + cache_meta: dict[str, dict]) -> str: + lines: list[str] = [] + if not hits: + lines.append(ansi(f"Name match {tokens} — NOT FOUND", ANSI_RED)) + return "\n".join(lines) + lines.append(ansi(f"Name match {tokens} — {len(hits)} hit(s)", ANSI_GREEN + ANSI_BOLD)) + for e in sorted(hits, key=lambda x: (x.jav_id, x.path.lower())): + meta = cache_meta.get(e.remote, {}) + if meta.get("cached"): + tag = ansi("[CACHED-STALE]", ANSI_YELLOW) if meta.get("stale") else ansi("[CACHED]", ANSI_DIM) + else: + tag = ansi("[FRESH]", ANSI_GREEN) + src = ansi(e.source, ANSI_YELLOW) + lines.append(f" {src} {tag} {ansi(e.jav_id, ANSI_CYAN)}") + lines.append(ansi(f" file: {Path(e.path).name}", ANSI_BOLD)) + lines.append(f" size: {human_size(e.size)} ({e.size:,} bytes)") + lines.append(ansi(f" path: {e.full_path}", ANSI_DIM)) + return "\n".join(lines) + + +def render_dupes(dupes: dict[str, list[FileEntry]], + skipped: list[tuple[str, str]], + variant_alerts: dict[str, list[FileEntry]] | None = None) -> None: + if not dupes: + console.print(Panel("[bold green]No duplicates found.[/]", + border_style="green")) + else: + console.print(f"[bold]Found {len(dupes)} duplicate ID group(s):[/]") + console.print() + total_reclaim = 0 + for jav_id in sorted(dupes): + entries = dupes[jav_id] + keep = decide_keep(entries) + tbl = Table(title=f"[bold][{jav_id}][/]", title_justify="left", + show_lines=False, border_style="magenta", expand=True) + tbl.add_column("Action", no_wrap=True) + tbl.add_column("Source", style="yellow", no_wrap=True) + tbl.add_column("Size", justify="right", no_wrap=True) + tbl.add_column("Path", overflow="fold") + for e in sorted(entries, key=lambda x: (x.source != "Source", x.source == "Catalog", -x.size)): + if e.source == "Catalog": + action = "[cyan]CATALOG[/]" + elif e is keep: + action = "[green]KEEP[/]" + else: + action = "[red]DELETE?[/]" + total_reclaim += e.size + tbl.add_row(action, e.source, + f"{human_size(e.size)}\n[dim]({e.size:,} 
B)[/]", + e.full_path) + console.print(tbl) + console.print() + console.print(Panel( + f"[bold]Potential space reclaim if all DELETE? removed: " + f"[red]{human_size(total_reclaim)}[/][/]", + border_style="red")) + if skipped: + console.print() + tbl = Table(title=f"[dim]Skipped {len(skipped)} file(s) with no parseable ID[/]", + title_justify="left", show_lines=False, border_style="dim", expand=True) + tbl.add_column("Remote", style="dim", no_wrap=True) + tbl.add_column("Path", style="dim", overflow="fold") + for remote, path in skipped[:50]: + tbl.add_row(remote, path) + if len(skipped) > 50: + tbl.add_row("[dim]…[/]", f"[dim]+{len(skipped) - 50} more[/]") + console.print(tbl) + if variant_alerts: + console.print() + console.print(Panel( + f"[bold yellow]⚠ {len(variant_alerts)} variant alert(s) — manual review recommended[/]", + border_style="yellow")) + for bare_id, entries in sorted(variant_alerts.items()): + tbl = Table(title=f"[bold yellow][{bare_id}] — bare + variant coexist[/]", + title_justify="left", show_lines=False, border_style="yellow", expand=True) + tbl.add_column("ID", style="yellow", no_wrap=True) + tbl.add_column("Size", justify="right", no_wrap=True) + tbl.add_column("Path", overflow="fold") + for e in sorted(entries, key=lambda x: x.full_path): + eid = extract_id(Path(e.path).name) or e.jav_id + tbl.add_row(eid, human_size(e.size), e.full_path) + console.print(tbl) + console.print() + + +def decide_keep_with_reason(entries: list[FileEntry]) -> tuple[FileEntry, dict[str, str]]: + """Pick KEEP candidate and explain the first ranking rule that settled it. + + Catalog entries are excluded — they are offline/informational. + + Ranking (descending priority, configurable via keep_ranking in config.json): + 1. Video files in ordered priority folders outrank other rclone entries. + 2. Source entries outrank Target entries when no priority-folder video exists. + 3. Non-.ts files outrank .ts files when a duplicate group has both. + 4. Largest file size. 
If sizes are within size_tolerance_mib, treated as equal + and format preference is consulted instead. + 5. Format preference: ordered list of extensions (e.g. mkv > mp4 > wmv > avi). + 6. Tie-break: has resolution tag in filename ([1080p], [2160p], [720p], [480p]). + 7. Tie-break: longer filename (more metadata = more descriptive). + """ + ranking = _KEEP_RANKING or {} + tolerance_bytes = int(float(ranking.get("size_tolerance_mib") or 0) * 1024 * 1024) + priority_folders: list[str] = [ + str(folder).strip() for folder in + (ranking.get("priority_folders") or DEFAULT_KEEP_RANKING["priority_folders"]) + if str(folder).strip() + ] + fmt_order: list[str] = list( + ranking.get("format_preference") or DEFAULT_KEEP_RANKING["format_preference"] + ) + use_res_tag: bool = ranking.get("tiebreak_res_tag", True) + use_longer_name: bool = ranking.get("tiebreak_longer_name", True) + + rclone = [e for e in entries if e.source != "Catalog"] + + def _priority_folder_rank(e: FileEntry) -> int | None: + if Path(e.path).suffix.lower() not in _VIDEO_EXTS: + return None + # A root can be cq:JAV while the favored folder is a child path, or the + # supplied root can itself end in that folder. Match across full_path. 
+ full_path = e.full_path.replace("\\", "/").strip("/").lower() + segments = [segment for segment in full_path.split("/") if segment] + for index, raw_folder in enumerate(priority_folders): + folder = raw_folder.replace("\\", "/").strip("/").lower() + if not folder: + continue + if "/" in folder or ":" in folder: + framed = f"/{full_path}/" + if full_path == folder or full_path.startswith(folder + "/") or f"/{folder}/" in framed: + return index + elif folder in segments: + return index + return None + + prioritized = [(rank, e) for e in rclone if (rank := _priority_folder_rank(e)) is not None] + best_priority = min((rank for rank, _ in prioritized), default=None) + priority_videos = [e for rank, e in prioritized if rank == best_priority] + pool_priority = [e for e in rclone if e.source == "Source"] + reason = {"code": "fallback", "summary": "First remaining duplicate candidate"} + if priority_videos: + pool = priority_videos + reason = { + "code": "vip_folder", + "summary": f"VIP folder: {priority_folders[best_priority]}", + } + elif pool_priority: + pool = pool_priority + reason = {"code": "source", "summary": "Source copy outranks target copies"} + else: + pool = rclone if rclone else entries + + # Transport streams often inflate size without being the better keeper. 
+ preferred_containers = [ + e for e in pool if Path(e.path).suffix.lower() not in _LOWEST_KEEP_PRIORITY_EXTS + ] + if preferred_containers and len(preferred_containers) != len(pool): + pool = preferred_containers + reason = {"code": "container", "summary": "Non-TS video outranks transport stream"} + + # Step 1: narrow to within size tolerance of the maximum + max_size = max(e.size for e in pool) + candidates = [e for e in pool if max_size - e.size <= tolerance_bytes] + + if len(candidates) == 1: + if len(pool) > 1 and reason["code"] not in {"vip_folder", "source", "container"}: + reason = {"code": "size", "summary": "Largest file after ranking rules"} + return candidates[0], reason + + # Step 2: format preference (lower index in fmt_order = higher priority) + def _fmt_rank(e: FileEntry) -> int: + ext = Path(e.path).suffix.lower().lstrip(".") + try: + return fmt_order.index(ext) # lower = better + except ValueError: + return len(fmt_order) # unknown = lowest + + best_fmt = min(_fmt_rank(e) for e in candidates) + by_fmt = [e for e in candidates if _fmt_rank(e) == best_fmt] + if len(by_fmt) != len(candidates): + ext = Path(by_fmt[0].path).suffix.lower().lstrip(".").upper() or "preferred format" + reason = {"code": "format", "summary": f"Format preference: {ext}"} + candidates = by_fmt + + if len(candidates) == 1: + return candidates[0], reason + + # Step 3: resolution tag tie-break + if use_res_tag: + tagged = [e for e in candidates if _RES_LABEL_RE.search(Path(e.path).name)] + if tagged: + if len(tagged) != len(candidates): + reason = {"code": "resolution_tag", "summary": "Filename has a resolution tag"} + candidates = tagged + + if len(candidates) == 1: + return candidates[0], reason + + # Step 4: longer filename tie-break + if use_longer_name: + keep = max(candidates, key=lambda e: len(Path(e.path).name)) + return keep, {"code": "filename", "summary": "Longer filename tie-break"} + + return candidates[0], reason + + +def decide_keep(entries: list[FileEntry]) -> 
def describe_dupe_risks(jav_id: str, entries: list[FileEntry]) -> list[dict[str, str]]:
    """Return manual-review warnings for one duplicate group.

    Catalog rows are ignored, and nothing is reported for an ID that already
    carries a ``#part`` marker. Otherwise up to two findings are produced, in
    this order:

    * ``large_same_id_group`` - three or more non-Catalog files share the ID.
    * ``part_like_suffix`` - for at least one file, the text that follows the
      matched ID in the filename stem still matches
      ``_SUSPICIOUS_MULTIPART_TAIL_RE`` after resolution tags are removed.

    Each finding is a ``{"code": ..., "summary": ...}`` dict.
    """
    if "#part" in jav_id:
        # Both checks below only ever fire for IDs without a part marker.
        return []

    remote_files = [item for item in entries if item.source != "Catalog"]
    findings: list[dict[str, str]] = []

    file_count = len(remote_files)
    if file_count >= 3:
        findings.append({
            "code": "large_same_id_group",
            "summary": f"{file_count} files share this base ID; review for unrecognized parts before deleting.",
        })

    part_like: list[str] = []
    for item in remote_files:
        file_path = Path(item.path)
        stem = file_path.stem
        # Try the ID patterns in the same priority order: primary, compound, fallback.
        id_match = PRIMARY_ID_RE.match(stem)
        if not id_match:
            id_match = COMPOUND_ID_RE.match(stem)
        if not id_match:
            id_match = FALLBACK_ID_RE.match(stem)
        if not id_match:
            continue
        # Only the text after the ID is inspected, with resolution tags stripped
        # so something like "[1080p]" is not mistaken for a part number.
        remainder = _RESOLUTION_TAG_RE.sub("", stem[id_match.end():]).strip()
        if _SUSPICIOUS_MULTIPART_TAIL_RE.search(remainder):
            part_like.append(file_path.name)

    if part_like:
        shown = ", ".join(part_like[:3])
        ellipsis = " ..." if len(part_like) > 3 else ""
        findings.append({
            "code": "part_like_suffix",
            "summary": f"Part-like suffixes still share the base ID: {shown}{ellipsis}",
        })
    return findings
def _library_issue_record(remote: str, f: dict, canonical_name: str, issue: str) -> dict:
    """Build one report row for a cached file entry with a non-canonical name."""
    size = f.get("size", 0)
    return {
        "remote": remote,
        "path": f["path"],
        "size": size,
        "size_human": human_size(size),
        "mod_time": f.get("mod_time", ""),
        "jav_id": f.get("jav_id", ""),
        "canonical_name": canonical_name,
        "issue": issue,
    }


def find_library_issues(cache: dict) -> dict:
    """Scan the cache for files with non-canonical names.

    Two kinds of names are reported:

    * ``bracket_id`` - the filename stem starts with ``[`` and matches
      ``_BRACKET_ID_RE``.
    * ``nohyphen_id`` - the stem matches ``_NOHYPHEN_ID_RE`` and none of the
      primary, compound or fallback ID patterns.

    Cached entries without a usable ``path`` are skipped instead of raising,
    in line with the ``.get`` defaults used for every other field.

    Returns:
        {"bracket_names": [...], "nohyphen_names": [...]}
        Each entry: {remote, path, size, size_human, mod_time, jav_id,
        canonical_name, issue}
    """
    bracket: list[dict] = []
    nohyphen: list[dict] = []
    for remote, remote_data in cache.get("remotes", {}).items():
        for f in remote_data.get("files", []):
            path = f.get("path")
            if not path:
                # Nothing to classify or rename without a path.
                continue
            fname = Path(path).name
            stem = Path(fname).stem
            if stem.startswith("[") and _BRACKET_ID_RE.match(stem):
                bracket.append(_library_issue_record(
                    remote, f, _bracket_to_canonical(fname), "bracket_id"))
            elif (not PRIMARY_ID_RE.match(stem)
                    and not COMPOUND_ID_RE.match(stem)
                    and not FALLBACK_ID_RE.match(stem)
                    and _NOHYPHEN_ID_RE.match(stem)):
                nohyphen.append(_library_issue_record(
                    remote, f, _nohyphen_to_canonical(fname), "nohyphen_id"))
    return {"bracket_names": bracket, "nohyphen_names": nohyphen}
def rename_files_batch(
    renames: list[dict],
    cache: dict,
    rclone_bin: str = "rclone",
) -> list[dict]:
    """Rename multiple files, writing cache once at the end.

    Each item in renames: {remote, old_path, new_path}

    Items are validated before anything is executed. An item that is not a
    dict, or whose ``remote`` / ``old_path`` / ``new_path`` is missing, not a
    string, or empty, gets a failure result instead of raising. The remaining
    items are still processed. Empty paths are rejected because
    ``rename_file_in_remote`` joins the remote and the relative path, so an
    empty path would address the remote root itself.

    Returns list of per-file results with old_path/new_path echoed back
    (``None`` when an invalid item did not supply them). The cache is saved
    once, and only if at least one rename succeeded.
    """
    required = ("remote", "old_path", "new_path")
    results: list[dict] = []
    cache_dirty = False
    for item in renames:
        is_mapping = isinstance(item, dict)
        valid = is_mapping and all(
            isinstance(item.get(key), str) and item[key] for key in required
        )
        if not valid:
            # Same result shape as a failed rename so callers can treat every
            # row uniformly.
            results.append({
                "ok": False,
                "error": "Rename item must be an object with non-empty string "
                         "remote, old_path and new_path",
                "conflict": False,
                "old_path": item.get("old_path") if is_mapping else None,
                "new_path": item.get("new_path") if is_mapping else None,
            })
            continue
        res = rename_file_in_remote(
            item["remote"], item["old_path"], item["new_path"],
            cache, rclone_bin=rclone_bin, save=False,
        )
        # Echo the relative paths back; the single-file helper reports full paths.
        res["old_path"] = item["old_path"]
        res["new_path"] = item["new_path"]
        results.append(res)
        if res["ok"]:
            cache_dirty = True
    if cache_dirty:
        save_cache(cache)
    return results
render_dupes_plain(dupes, skipped, variant_alerts=None) -> str: + lines: list[str] = [] + if not dupes: + lines.append(ansi("No duplicates found.", ANSI_GREEN)) + else: + lines.append(ansi(f"Found {len(dupes)} duplicate ID group(s):", ANSI_BOLD)) + lines.append("") + total_reclaim = 0 + for jav_id in sorted(dupes): + entries = dupes[jav_id] + keep = decide_keep(entries) + lines.append(ansi(f"[{jav_id}]", ANSI_BOLD)) + for e in sorted(entries, key=lambda x: (x.source != "Source", x.source == "Catalog", -x.size)): + if e.source == "Catalog": + mark = ansi("CATALOG ", ANSI_CYAN) + elif e is keep: + mark = ansi("KEEP ", ANSI_GREEN) + else: + mark = ansi("DELETE? ", ANSI_RED) + total_reclaim += e.size + src = ansi(f"{e.source:>8}", ANSI_YELLOW) + size_str = f"{human_size(e.size)} ({e.size:,} B)" + lines.append(f" {mark} {src} {size_str:>26} {e.full_path}") + lines.append("") + lines.append(ansi(f"Potential space reclaim if all DELETE? removed: {human_size(total_reclaim)}", ANSI_BOLD)) + if skipped: + lines.append("") + lines.append(ansi(f"Skipped {len(skipped)} file(s) with no parseable ID:", ANSI_DIM)) + for remote, path in skipped[:50]: + lines.append(ansi(f" {remote} {path}", ANSI_DIM)) + if len(skipped) > 50: + lines.append(ansi(f" ... 
def describe_skipped_id(remote: str, path: str) -> dict[str, str]:
    """Explain a common reason a path did not yield an ID.

    Only the final path component is inspected; backslashes are treated as
    path separators. The checks run in order and the first match wins:

    1. the ID is wrapped in leading square brackets,
    2. prefix and number are joined by a non-ASCII dash,
    3. prefix and number are joined with no separator at all.

    If none match, a generic reason and hint are returned.

    Returns a dict with ``remote`` and ``path`` echoed back unchanged, plus
    ``name`` (the inspected filename), ``reason`` and ``hint``.
    """
    name = Path((path or "").replace("\\", "/")).name
    reason = "No supported JAV ID at filename start"
    hint = "Rename with a leading ID such as ABC-123 or add an ID normalizer/site-specific source."
    if re.match(r"^\[[A-Za-z0-9-]+-\d+\]", name):
        reason = "ID is wrapped in leading brackets"
        hint = "Remove the leading brackets so the filename starts with the ID."
    # Dash look-alikes: U+2010..U+2015 (hyphen through horizontal bar),
    # U+2212 minus sign, U+FE58 small em dash, U+FE63 small hyphen-minus and
    # U+FF0D fullwidth hyphen-minus.
    elif re.match(r"^[A-Za-z][A-Za-z0-9]+[\u2010-\u2015\u2212\uFE58\uFE63\uFF0D]\d+", name):
        reason = "ID uses a non-ASCII dash"
        hint = "Replace the separator with a normal hyphen."
    elif re.match(r"^[A-Za-z][A-Za-z0-9]+\d+", name):
        reason = "ID prefix and number have no hyphen"
        hint = "Insert the ID hyphen, for example ABC-123."
    return {"remote": remote, "path": path, "name": name, "reason": reason, "hint": hint}
Repeatable.") + ap.add_argument("--target", "-t", action="append", default=[], metavar="REMOTE", + help="Target remote path (non-priority — largest size wins among targets). Repeatable.") + ap.add_argument("--format", choices=["console", "txt", "csv", "json", "all"], + default="console") + ap.add_argument("--output-dir", default="./reports", help="Where to write txt/csv/json.") + ap.add_argument("--no-color", action="store_true") + ap.add_argument("--rclone-bin", default="rclone", + help="Path to rclone executable (default: 'rclone' on PATH).") + ap.add_argument("--search", action="append", default=[], metavar="ID", + help="Search mode: look up a JAV ID (e.g. SSIS-001). Repeatable. " + "If no --source/--target given, default target is used.") + ap.add_argument("--name", action="append", default=[], metavar="STR", + help="Substring/glob search against filename. Case-insensitive. " + "Repeatable; OR semantics (any token match = hit). " + "Supports * and ? wildcards. Use quotes for spaces.") + ap.add_argument("--update", "-u", action="store_true", + help="Search mode: force re-scan and overwrite cache for requested remotes.") + ap.add_argument("--no-cache", action="store_true", + help="Search mode: bypass cache entirely (no read, no write).") + ap.add_argument("--quick", "-q", action="store_true", + help="Force quick mode: skip cache, query rclone directly with --include glob. " + "Default is auto: single exact IDs use quick, wildcards/ranges/multi use cached.") + ap.add_argument("--cache", action="store_true", + help="Force cached mode (opposite of --quick).") + ap.add_argument("--save", action="store_true", + help="Persist the --source / --target / --catalog values you passed " + "as new defaults in config.json next to the script. " + "Only keys you explicitly passed are saved.") + ap.add_argument("--scan", action="store_true", + help="Walk configured remotes, refresh cache, exit. No search/dupe output. " + "Default scope: DEFAULT_TARGET. 
Override with --source/--target. " + "Always overwrites cache. Suitable for Task Scheduler / cron.") + ap.add_argument("--scan-since", metavar="DURATION", + help="Incremental scan: only walk files modified within DURATION " + "(e.g. 24h, 7d, 30m, 90s). Merges new/changed entries on top of " + "the existing cache; old entries are preserved. Falls back to a " + "full scan if there's no prior cache for a remote. Requires --scan.") + ap.add_argument("--catalog", action="append", default=[], metavar="PATH", + help="Path to a WinCatalog CSV or XML export. Repeatable. " + "Listed under 'Catalog' in results (informational, never KEEP/DELETE?).") + ap.add_argument("--part-pattern", action="append", default=[], metavar="REGEX", + help="Extra multipart filename regex. Repeatable; first capture group must be the part number. " + "Patterns run against the filename stem after built-in part detectors.") + ap.add_argument("--library-issues", action="store_true", + help="Report non-canonical filenames (bracket-wrapped IDs, no-hyphen IDs). " + "Reads from cache. Outputs JSON when --format json, plain otherwise.") + ap.add_argument("--rename-file", action="store_true", + help="Rename one file in a remote and patch cache. " + "Requires --remote, --old-path, --new-path. Outputs JSON.") + ap.add_argument("--rename-files-batch", action="store_true", + help="Rename multiple files in one call, writing cache once. " + "Reads JSON array of {remote, old_path, new_path} from stdin. Outputs JSON.") + ap.add_argument("--remote", metavar="REMOTE", + help="Remote path root for --rename-file (e.g. cq:JAV).") + ap.add_argument("--old-path", metavar="PATH", + help="Relative path of the file to rename (within --remote).") + ap.add_argument("--new-path", metavar="PATH", + help="New relative path after rename (within --remote).") + ap.add_argument("--basic", action="store_true", + help="Plain text output, no rich tables/panels/progress bars. 
" + "Useful for piping or low-bandwidth terminals.") + ap.add_argument("--clearjav", action="store_true", + help="Shortcut: use DEFAULT_SOURCE as --source and DEFAULT_TARGET as --target, " + "Equivalent to " + "`--source cq:personal-files/ClearJAV --target cq:personal-files/JAV/TMP`.") + args = ap.parse_args() + + global RCLONE_BIN, console, BASIC, DEFAULT_SOURCE, DEFAULT_TARGET, DEFAULT_CATALOG + RCLONE_BIN = args.rclone_bin + BASIC = args.basic or args.format == "json" + + # Apply persisted config overrides BEFORE defaults are consulted. + cfg = load_config() + if "default_source" in cfg: + DEFAULT_SOURCE = list(cfg["default_source"]) + if "default_target" in cfg: + DEFAULT_TARGET = list(cfg["default_target"]) + if "default_catalog" in cfg: + DEFAULT_CATALOG = list(cfg["default_catalog"]) + global _KEEP_RANKING + _KEEP_RANKING = cfg.get("keep_ranking") or {} + part_patterns = list(cfg.get("part_patterns") or []) + list(args.part_pattern) + pattern_errors = configure_part_patterns(part_patterns) + if pattern_errors: + for err in pattern_errors: + console.print(f"[red]invalid part pattern:[/] {err}") + sys.exit(2) + + # --save: persist explicitly-passed values, exit. 
+ if args.save: + if not (args.source or args.target or args.catalog or args.part_pattern): + console.print("[red]--save needs at least one --source/--target/--catalog/--part-pattern value to persist.[/]") + sys.exit(2) + new_cfg = dict(cfg) + if args.source: + new_cfg["default_source"] = list(args.source) + if args.target: + new_cfg["default_target"] = list(args.target) + if args.catalog: + new_cfg["default_catalog"] = list(args.catalog) + if args.part_pattern: + new_cfg["part_patterns"] = list(args.part_pattern) + save_config(new_cfg) + console.print(f"[green]Saved to {CONFIG_PATH}:[/]") + for k in ("default_source", "default_target", "default_catalog", "part_patterns"): + if k in new_cfg: + console.print(f" {k} = {new_cfg[k]}") + sys.exit(0) + global USE_ANSI + USE_ANSI = not args.no_color + if args.no_color or BASIC: + console = Console(no_color=True, color_system=None, highlight=False) + + # Search mode: defaults kick in if no remotes specified. + if args.clearjav: + if not args.source: + args.source = list(DEFAULT_SOURCE) + if not args.target: + args.target = list(DEFAULT_TARGET) + + if args.search and not args.source and not args.target: + args.target = list(DEFAULT_TARGET) + + # --scan: default to DEFAULT_TARGET only, always overwrite cache. + if args.scan: + if not args.source and not args.target: + args.target = list(DEFAULT_TARGET) + args.update = True + + # Use default catalog(s) if user passed none. + if not args.catalog and DEFAULT_CATALOG: + args.catalog = list(DEFAULT_CATALOG) + + # --library-issues: read-only cache scan for non-canonical filenames. 
+ if args.library_issues: + cache = load_cache() + issues = find_library_issues(cache) + if args.format == "json" or BASIC: + print(json.dumps({"ok": True, **issues})) + else: + bracket = issues["bracket_names"] + nohyphen = issues["nohyphen_names"] + total = len(bracket) + len(nohyphen) + if not total: + console.print(Panel("[bold green]No library issues found.[/]", title="Library Issues")) + else: + from rich.table import Table + t = Table(title=f"Library Issues ({total} file(s))", show_lines=True) + t.add_column("Issue", style="yellow", width=14) + t.add_column("Current Name") + t.add_column("Canonical Name", style="green") + t.add_column("Remote", style="dim") + for e in bracket: + t.add_row("bracket ID", Path(e["path"]).name, + e["canonical_name"], e["remote"]) + for e in nohyphen: + t.add_row("no hyphen", Path(e["path"]).name, + e["canonical_name"], e["remote"]) + console.print(t) + sys.exit(0) + + # --rename-files-batch: rename multiple files, single cache write. + if args.rename_files_batch: + try: + renames = json.loads(sys.stdin.read()) + except json.JSONDecodeError as e: + print(json.dumps({"ok": False, "error": f"Invalid JSON on stdin: {e}"})) + sys.exit(1) + if not isinstance(renames, list): + print(json.dumps({"ok": False, "error": "stdin must be a JSON array"})) + sys.exit(1) + cache = load_cache() + results = rename_files_batch(renames, cache, rclone_bin=RCLONE_BIN) + ok = any(r["ok"] for r in results) + print(json.dumps({"ok": ok, "results": results})) + sys.exit(0 if ok else 1) + + # --rename-file: rename one file in a remote and patch cache. 
+ if args.rename_file: + if not args.remote or not args.old_path or not args.new_path: + ap.error("--rename-file requires --remote, --old-path, and --new-path.") + cache = load_cache() + result = rename_file_in_remote( + args.remote, args.old_path, args.new_path, cache, rclone_bin=RCLONE_BIN + ) + print(json.dumps(result)) + sys.exit(0 if result["ok"] else 1) + + if not args.source and not args.target and not args.catalog: + ap.error("Provide at least one --source, --target, or --catalog.") + + # Scan-only mode: walk remotes, write cache, summary, exit. + if args.scan: + scan_since = None + if args.scan_since: + scan_since = parse_duration(args.scan_since) + if not scan_since: + console.print(f"[red]invalid --scan-since value: {args.scan_since!r} " + f"(expected e.g. 24h, 7d, 30m, 90s)[/]") + sys.exit(2) + cache = load_cache() + cache_meta: dict[str, dict] = {} + skipped: list[tuple[str, str]] = [] + t0 = time.perf_counter() + if BASIC: + # `--scan` resolves its default target above. Report only the + # remotes that this scan will actually walk; falling back here to + # DEFAULT_SOURCE would resurrect retired source roots in job UI. 
+ _all_remotes = list(args.source) + list(args.target) + sys.stderr.write("SCAN_START " + json.dumps({ + "remotes": _all_remotes, "total": len(_all_remotes), + }) + "\n") + sys.stderr.flush() + entries = (cached_collect(args.source, "Source", skipped, cache, + use_cache=not args.no_cache, force_update=True, + cache_meta=cache_meta, scan_since=scan_since) + + cached_collect(args.target, "Target", skipped, cache, + use_cache=not args.no_cache, force_update=True, + cache_meta=cache_meta, scan_since=scan_since)) + if not args.no_cache: + save_cache(cache) + elapsed = time.perf_counter() - t0 + if BASIC: + sys.stderr.write(f"Scan complete: {len(entries)} files in {elapsed:.2f}s\n") + sys.stderr.write(f"Cache: {CACHE_PATH}\n" if not args.no_cache + else "Cache: (skipped, --no-cache)\n") + else: + console.print(f"[bold green]Scan complete:[/] {len(entries)} files in {elapsed:.2f}s") + if not args.no_cache: + console.print(f"[dim]Cache: {CACHE_PATH}[/]") + else: + console.print("[dim]Cache: (skipped, --no-cache)[/]") + sys.exit(0) + + skipped: list[tuple[str, str]] = [] + t0 = time.perf_counter() + + if args.search or args.name: + search_timings: dict[str, int] = {} + # If --name was passed without explicit remotes, fall back to default target + # (catalog default already injected earlier; don't let it suppress remote defaulting). + if args.name and not args.search and not args.source and not args.target: + args.target = list(DEFAULT_TARGET) + # Substring name search can't be server-side filtered on most backends — cache wins. + # Only the ID search shape benefits from quick (server-side prefix glob). 
+ if args.name and not args.quick: + mode, reason = "cached", "name substring search — cache is faster than rclone --include" + else: + combined = list(args.search) + list(args.name) + mode, reason = choose_search_mode(combined, args.quick, args.cache) + if BASIC: + sys.stderr.write(f"Mode: {mode} ({reason})\n") + else: + mode_color = "green" if mode == "quick" else "cyan" + console.print(f"[{mode_color}]Mode: {mode}[/] [dim]({reason})[/]") + + phase_t0 = time.perf_counter() + cache = load_cache() + search_timings["cache_load_ms"] = round((time.perf_counter() - phase_t0) * 1000) + use_cache = not args.no_cache and mode == "cached" + cache_meta: dict[str, dict] = {} + phase_t0 = time.perf_counter() + if mode == "quick": + all_patterns: list[str] = [] + for raw in args.search: + all_patterns.extend(query_to_include_patterns(raw)) + all_patterns.extend(name_to_include_patterns(args.name)) + entries = [] + for r in args.source: + cache_meta[r] = {"cached": False, "age": "quick", "stale": False, "file_count": 0} + got = quick_search_remote(r, "Source", all_patterns, skipped) + entries.extend(got) + cache_meta[r]["file_count"] = len(got) + for r in args.target: + cache_meta[r] = {"cached": False, "age": "quick", "stale": False, "file_count": 0} + got = quick_search_remote(r, "Target", all_patterns, skipped) + entries.extend(got) + cache_meta[r]["file_count"] = len(got) + else: + entries = (cached_collect(args.source, "Source", skipped, cache, + use_cache, args.update, cache_meta) + + cached_collect(args.target, "Target", skipped, cache, + use_cache, args.update, cache_meta)) + search_timings["entry_collect_ms"] = round((time.perf_counter() - phase_t0) * 1000) + # Load each catalog separately so cache_meta gets the per-catalog count + # (was global total — every catalog reported the sum across all). 
+ catalog_entries: list[FileEntry] = [] + phase_t0 = time.perf_counter() + for cp_str in args.catalog: + for cp in _expand_catalog_paths([cp_str]): + ext = cp.suffix.lower() + if ext == ".csv": + one = load_catalog_csv(cp, skipped) + elif ext == ".xml": + one = load_catalog_xml(cp, skipped) + else: + console.print(f"[yellow]WARN: unknown catalog format '{ext}' for {cp}; skipping.[/]") + continue + catalog_entries.extend(one) + cache_meta[f"catalog:{cp.name}"] = { + "cached": False, "age": "loaded", "stale": False, + "file_count": len(one), + } + entries.extend(catalog_entries) + search_timings["catalog_load_ms"] = round((time.perf_counter() - phase_t0) * 1000) + if use_cache and args.update: + save_cache(cache) + else: + if args.cache and not args.no_cache: + cache = load_cache() + cache_meta: dict[str, dict] = {} + entries = (cached_collect(args.source, "Source", skipped, cache, + use_cache=True, force_update=False, + cache_meta=cache_meta) + + cached_collect(args.target, "Target", skipped, cache, + use_cache=True, force_update=False, + cache_meta=cache_meta)) + else: + remotes_by_label = ([("Source", r) for r in args.source] + + [("Target", r) for r in args.target]) + entries = collect_with_progress(remotes_by_label, skipped) + entries.extend(load_catalogs(args.catalog, skipped)) + + elapsed = time.perf_counter() - t0 + if BASIC: + sys.stderr.write(f"Scanned/loaded {len(entries)} file(s) in {elapsed:.2f}s\n") + else: + console.print(f"[dim]Scanned/loaded {len(entries)} file(s) in {elapsed:.2f}s[/]") + + if args.search or args.name: + # query_expansions: original_raw -> list of normalized IDs / wildcard patterns to look up + query_expansions: dict[str, list[str]] = {} + queries: list[str] = [] + for raw in args.search: + if RANGE_RE.search(raw): + expanded = expand_range(raw) or [] + normed: list[str] = [] + for r in expanded: + n = normalize_id(r) + if n: + normed.append(n) + if not normed: + console.print(f"[yellow]WARN: range '{raw}' produced no valid IDs.[/]") 
+ continue + queries.append(raw) + query_expansions[raw] = normed + continue + if "*" in raw or "?" in raw: + q = raw.upper() + queries.append(q) + query_expansions[q] = [q] + continue + norm = normalize_id(raw) + if not norm: + console.print(f"[yellow]WARN: cannot parse '{raw}' as a JAV ID, skipping.[/]") + continue + # Use the raw (upper-cased) form for display so leading zeros are preserved + # (e.g. user types PRTD-027 — keep it, don't show PRTD-27). Lookup still uses + # the normalized form internally. + display = raw.upper() + queries.append(display) + query_expansions[display] = [norm] + phase_t0 = time.perf_counter() + index: dict[str, list[FileEntry]] = {} + for e in entries: + index.setdefault(e.jav_id, []).append(e) + search_timings["index_ms"] = round((time.perf_counter() - phase_t0) * 1000) + phase_t0 = time.perf_counter() + matches: dict[str, list[FileEntry]] = {} + match_traces: dict[str, dict[int, dict[str, str]]] = {} + for q in queries: + expansions = query_expansions.get(q, [q]) + hits: list[FileEntry] = [] + seen: set[int] = set() + traces: dict[int, dict[str, str]] = {} + + def add_hit(entry: FileEntry, matched_query: str) -> None: + key = id(entry) + if key in seen: + return + seen.add(key) + hits.append(entry) + traces[key] = describe_id_match(q, matched_query, entry.jav_id, len(expansions)) + + for sub in expansions: + if "*" in sub or "?" in sub: + pat = sub if "#PART" in sub.upper() else sub + "*" + for k, v in index.items(): + if fnmatch.fnmatchcase(k, pat): + for e in v: + add_hit(e, sub) + elif "#part" in sub: + for e in index.get(sub, []): + add_hit(e, sub) + else: + for e in index.get(sub, []): + add_hit(e, sub) + for k, v in index.items(): + if k.startswith(sub + "#part"): + for e in v: + add_hit(e, sub) + matches[q] = hits + match_traces[q] = traces + search_timings["match_ms"] = round((time.perf_counter() - phase_t0) * 1000) + if args.format == "json": + # Structured output for tools that consume search results (e.g. 
the rclonex + # Brave extension). Includes everything needed to drive a UI: per-query hits + # with source/remote/path/size/mod_time, plus name-match block + skipped. + name_hits_json: list[FileEntry] = [] + if args.name: + for e in entries: + if name_match(Path(e.path).stem, args.name): + name_hits_json.append(e) + out_obj = { + "queries": [ + { + "query": q, + "hits": [ + {"source": e.source, "remote": e.remote, "path": e.path, + "full_path": e.full_path, "size": e.size, + "size_human": human_size(e.size), + "mod_time": e.mod_time, "jav_id": e.jav_id, + **match_traces.get(q, {}).get(id(e), {})} + for e in sorted(matches.get(q, []), key=lambda x: (x.jav_id, x.path.lower())) + ], + } + for q in queries + ], + "name_matches": [ + {"source": e.source, "remote": e.remote, "path": e.path, + "full_path": e.full_path, "size": e.size, + "size_human": human_size(e.size), "mod_time": e.mod_time, + "jav_id": e.jav_id, "match_kind": "name", + "match_reason": "Filename search", "match_confidence": "broad", + "matched_query": ", ".join(args.name), "matched_id": e.jav_id} + for e in sorted(name_hits_json, key=lambda x: (x.jav_id, x.path.lower())) + ], + "name_tokens": list(args.name), + "cache_meta": cache_meta, + "skipped_count": len(skipped), + "elapsed_sec": round(time.perf_counter() - t0, 3), + "timings": search_timings, + } + print(json.dumps(out_obj)) + id_ok = (not queries) or all(matches.values()) + name_ok = (not args.name) or bool(name_hits_json) + sys.exit(0 if (id_ok and name_ok) else 1) + if queries: + if BASIC: + print(render_search_plain(matches, queries, cache_meta)) + else: + render_search(matches, queries, cache_meta) + # --name results as a separate block + name_hits: list[FileEntry] = [] + if args.name: + for e in entries: + if name_match(Path(e.path).stem, args.name): + name_hits.append(e) + if BASIC: + print(render_name_matches_plain(name_hits, args.name, cache_meta)) + else: + render_name_matches(name_hits, args.name, cache_meta) + # Exit code: 0 if every 
search query had hits AND name-search (if used) returned hits. + id_ok = (not queries) or all(matches.values()) + name_ok = (not args.name) or bool(name_hits) + sys.exit(0 if (id_ok and name_ok) else 1) + + dupes = find_dupes(entries) + variant_alerts = find_variant_alerts(entries) + if args.format == "json" and BASIC: + print(json.dumps(dupes_to_obj(dupes, skipped, variant_alerts))) + sys.exit(0) + if BASIC: + print(render_dupes_plain(dupes, skipped, variant_alerts)) + else: + render_dupes(dupes, skipped, variant_alerts) + + if args.format != "console": + out_dir = Path(args.output_dir) + out_dir.mkdir(parents=True, exist_ok=True) + stamp = datetime.now().strftime("%Y%m%d-%H%M%S") + targets = {"txt", "csv", "json"} if args.format == "all" else {args.format} + if "txt" in targets: + write_txt(out_dir / f"dupes-{stamp}.txt", dupes, skipped) + if "csv" in targets: + write_csv(out_dir / f"dupes-{stamp}.csv", dupes) + if "json" in targets: + write_json(out_dir / f"dupes-{stamp}.json", dupes, skipped, variant_alerts) + console.print(f"[dim]Reports written to {out_dir}[/]") + + +if __name__ == "__main__": + try: + main() + except KeyboardInterrupt: + console.print("\n[yellow]Aborted by user (Ctrl+C). 
Cache not written for in-flight scans.[/]") + sys.exit(130) diff --git a/tests/test_rules.py b/tests/test_rules.py new file mode 100644 index 0000000..5e4c147 --- /dev/null +++ b/tests/test_rules.py @@ -0,0 +1,74 @@ +import importlib.util +import sys +import unittest +from pathlib import Path + + +ROOT = Path(__file__).resolve().parents[1] +SPEC = importlib.util.spec_from_file_location("rcjav_rules", ROOT / "rc-jav.py") +RCJAV = importlib.util.module_from_spec(SPEC) +sys.modules[SPEC.name] = RCJAV +SPEC.loader.exec_module(RCJAV) + + +def entry(path, size=1_000, jav_id="TEST-001"): + return RCJAV.FileEntry( + source="Target", + remote="cq:JAV", + path=path, + size=size, + mod_time="", + jav_id=jav_id, + ) + + +class IdRuleTests(unittest.TestCase): + def test_builtin_multipart_shapes_keep_parts_distinct(self): + expected = { + "KV-118 - Aiba Reika_PART1.mp4": "KV-118#part1", + "KV-118_A.mp4": "KV-118#part1", + "OFJE-195-7 [480p].mp4": "OFJE-195#part7", + "ABC-027.mp4": "ABC-027", + } + for name, jav_id in expected.items(): + with self.subTest(name=name): + self.assertEqual(RCJAV.extract_id(name), jav_id) + + def test_multipart_files_do_not_form_base_duplicate_group(self): + files = [ + entry("KV-118 - Aiba Reika_PART1.mp4", jav_id="KV-118"), + entry("KV-118 - Aiba Reika_PART2.mp4", jav_id="KV-118"), + entry("KV-118 - Aiba Reika_PART3.mp4", jav_id="KV-118"), + ] + self.assertEqual(RCJAV.find_dupes(files), {}) + + def test_large_same_id_group_gets_manual_review_risk(self): + files = [ + entry("TEST-001 direct.mp4"), + entry("TEST-001 edit.mp4"), + entry("TEST-001 mirror.mp4"), + ] + risks = RCJAV.describe_dupe_risks("TEST-001", files) + self.assertIn("large_same_id_group", {risk["code"] for risk in risks}) + + +class KeepRankingTests(unittest.TestCase): + def test_vip_folder_beats_larger_non_vip_copy(self): + keep, reason = RCJAV.decide_keep_with_reason([ + entry("ClearJAV/TEST-001.mp4", size=2_000), + entry("Other/TEST-001 [1080p].mkv", size=9_000), + ]) + 
self.assertEqual(keep.path, "ClearJAV/TEST-001.mp4") + self.assertEqual(reason["code"], "vip_folder") + + def test_ts_loses_to_non_ts_even_when_larger(self): + keep, reason = RCJAV.decide_keep_with_reason([ + entry("Other/TEST-001.ts", size=9_000), + entry("Other/TEST-001.mp4", size=8_000), + ]) + self.assertEqual(keep.path, "Other/TEST-001.mp4") + self.assertEqual(reason["code"], "container") + + +if __name__ == "__main__": + unittest.main()