From ba57b7fd2164ac8f0612b938d2a792902a020ae2 Mon Sep 17 00:00:00 2001 From: admin Date: Fri, 22 May 2026 21:43:57 +0200 Subject: [PATCH] Step 10a + 10b: scaffold rcjav/ package, extract ID rules MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Carves the first slice out of the monolithic rc-jav.py (now 2017 lines, was 2230). Two new modules: rcjav/model.py FileEntry dataclass — the one shared shape that every other submodule will need. rcjav/ids.py Single source of truth for everything that influences a FileEntry.jav_id: PRIMARY_ID_RE, FALLBACK_ID_RE, COMPOUND_ID_RE, BUILTIN_PART_RES, configure_part_patterns, detect_part, detect_part_from_stem, part_key, extract_id, normalize_id, describe_id_match, expand_range, plus the supporting "private" regexes (_BRACKET_ID_RE, _RESOLUTION_TAG_RE, etc.) that other code in rc-jav.py still reads. rcjav/__init__.py re-exports the public surface so future external consumers can `from rcjav import extract_id` without caring which submodule it lives in. rc-jav.py drops the inline ID block and pulls everything from rcjav.ids via a single import statement. PART_RES is intentionally NOT imported — it's mutated by configure_part_patterns at runtime, so a captured top-level reference would go stale. A small helper `_current_part_res()` reads it dynamically via `_rcjav_ids.PART_RES`. fixtures/run.py fix: synthesized importlib module name changed from "rcjav" (which now collides with the real package directory) to "rcjav_script". Also prepends ROOT to sys.path so rc-jav.py's `from rcjav.model import …` resolves when run as `python fixtures/run.py`. 
Verified: - python rc-jav.py --help → usage banner prints - python fixtures/run.py → 17/17 cases pass - python -m unittest tests.test_rules → 5/5 OK Co-Authored-By: Claude Opus 4.7 --- fixtures/run.py | 9 +- rc-jav.py | 267 +++++----------------------------------------- rcjav/__init__.py | 24 +++++ rcjav/ids.py | 243 +++++++++++++++++++++++++++++++++++++++++ rcjav/model.py | 24 +++++ 5 files changed, 327 insertions(+), 240 deletions(-) create mode 100644 rcjav/__init__.py create mode 100644 rcjav/ids.py create mode 100644 rcjav/model.py diff --git a/fixtures/run.py b/fixtures/run.py index 3f97e68..e541fe3 100644 --- a/fixtures/run.py +++ b/fixtures/run.py @@ -15,7 +15,14 @@ from pathlib import Path ROOT = Path(__file__).resolve().parents[1] FIXTURES = Path(__file__).resolve().parent -SPEC = importlib.util.spec_from_file_location("rcjav", ROOT / "rc-jav.py") +# rc-jav.py now imports from the `rcjav/` package at ROOT, so the parent dir +# must be on sys.path before we exec it. +if str(ROOT) not in sys.path: + sys.path.insert(0, str(ROOT)) + +# Synthesized module name avoids collision with the real `rcjav` package +# directory (rc-jav.py now imports from it). +SPEC = importlib.util.spec_from_file_location("rcjav_script", ROOT / "rc-jav.py") RCJAV = importlib.util.module_from_spec(SPEC) sys.modules[SPEC.name] = RCJAV SPEC.loader.exec_module(RCJAV) diff --git a/rc-jav.py b/rc-jav.py index 37fc844..bbc49a5 100644 --- a/rc-jav.py +++ b/rc-jav.py @@ -32,112 +32,36 @@ from rich.progress import ( from rich.table import Table from rich.text import Text -PRIMARY_ID_RE = re.compile(r"^([A-Za-z]+)-(\d+)") -FALLBACK_ID_RE = re.compile(r"^([A-Za-z0-9]+)-(\d+)") -COMPOUND_ID_RE = re.compile(r"^([A-Za-z0-9]+(?:-[A-Za-z0-9]+)+)-(\d+)") - -# Part-suffix patterns: anchored at end of stem (after stripping extension). -# Each pattern's group(1) is the part number. 
-RANGE_RE = re.compile(r"\[(\d+)-(\d+)\]") - -# Non-anchored XofY probe used in detect_part() to resolve the priority conflict -# between a trailing (N) copy-marker suffix and an embedded XofY part indicator. -# Example: "ENKI-031 [1080p].2of2 (1)" — the (1) is a filesystem collision suffix -# (rclone, Windows copy), not a part number; the 2of2 is the real part indicator. -# This pattern intentionally has no end-anchor so it matches anywhere in the stem. -_XOFY_PRIORITY_RE = re.compile(r"[._ -](\d+)\s*of\s*\d+", re.IGNORECASE) - -BUILTIN_PART_RES = [ - re.compile(r"[-_ ](?:pt|part|cd|disc)[-_ ]?(\d+)$", re.IGNORECASE), - re.compile(r"\s*\((\d+)(?:\s*of\s*\d+)?\)$", re.IGNORECASE), - # Exported multipart filenames often end in `.1of2` / `-2 of 4`. - re.compile(r"[._ -](\d+)\s*of\s*\d+$", re.IGNORECASE), - # Bare numeric suffixes (`_N`, ` N`) are only treated as part numbers when - # the number is 1-2 digits. Wider patterns falsely matched resolution tags - # (`_2160`, `_4K2160`) and dates/years (`SSIS-001 2023.mp4` -> `#part2023`), - # corrupting cache keys. - # Staged detection also retries after resolution/actress cleanup, so end - # anchors can match both raw suffixes and metadata-blocked suffixes safely. - re.compile(r"_(\d{1,2})$"), - # Hyphen short-part suffix after the ID, e.g. OFJE-195-1 [480p].mp4. - # Limit to 1-2 digits so the base ID's usual 3+ digit numeric component - # does not make every canonical `ABC-123` filename look multipart. - re.compile(r"-(\d{1,2})$"), - # Lettered parts: separator (hyphen or underscore) followed by A-D. - # Uppercase only — lowercase letters are variant designators (e.g. IBW-902z) - # and are preserved as part of the base ID, not treated as part numbers. - re.compile(r"[-_]([A-D])$"), - # Bare uppercase letter directly after the ID digits with no separator, - # e.g. BAK-052A, BAK-052B. Lookbehind ensures a digit precedes. 
- re.compile(r"(?<=\d)([A-D])$"), - re.compile(r"\s+(\d{1,2})$"), -] -PART_RES = list(BUILTIN_PART_RES) +from rcjav.model import FileEntry +from rcjav import ids as _rcjav_ids +from rcjav.ids import ( + PRIMARY_ID_RE, + FALLBACK_ID_RE, + COMPOUND_ID_RE, + RANGE_RE, + BUILTIN_PART_RES, + configure_part_patterns, + detect_part, + detect_part_from_stem, + part_key, + extract_id, + normalize_id, + describe_id_match, + expand_range, + _VARIANT_SUFFIX_RE, + _RES_LABEL_RE, + _RESOLUTION_TAG_RE, + _BRACKET_ID_RE, + _NOHYPHEN_ID_RE, + _VIDEO_EXTS, + _LOWEST_KEEP_PRIORITY_EXTS, +) -def configure_part_patterns(patterns: Iterable[str]) -> list[str]: - """Extend part suffix detection with user regexes whose first group is part number.""" - global PART_RES - PART_RES = list(BUILTIN_PART_RES) - errors: list[str] = [] - for pattern in patterns: - pattern = str(pattern or "").strip() - if not pattern: - continue - try: - compiled = re.compile(pattern, re.IGNORECASE) - except re.error as e: - errors.append(f"{pattern!r}: {e}") - continue - if compiled.groups < 1: - errors.append(f"{pattern!r}: needs a capture group for the part number") - continue - PART_RES.append(compiled) - return errors - - -def detect_part(stem: str) -> str | None: - """Return part number as string if stem ends with a part suffix, else None. - - XofY (e.g. .2of2) anywhere in the stem takes unconditional priority over a - trailing (N) suffix. A file named 'ENKI-031 [1080p].2of2 (1).mp4' is part 2; - the trailing (1) is a filesystem copy-collision marker (rclone / Windows), - not a part number. Without this pre-check the ordered PART_RES list would - match (1) first and misclassify the file as part 1. 
- """ - m = _XOFY_PRIORITY_RE.search(stem) - if m: - return m.group(1) - for r in PART_RES: - m = r.search(stem) - if m: - return m.group(1) - return None - - -def part_key(part: str) -> str: - token = str(part or "").strip() - if token.isdigit(): - return str(int(token)) - if len(token) == 1 and token.isalpha(): - return str(ord(token.upper()) - ord("A") + 1) - return token.upper() - - -@dataclass -class FileEntry: - source: str # "Source" (priority) or "Target" - remote: str # the rclone remote:path root supplied - path: str # relative path within remote - size: int - mod_time: str - jav_id: str # normalized, e.g. "SSIS-1" - - @property - def full_path(self) -> str: - sep = "" if self.remote.endswith("/") or not self.path else "/" - return f"{self.remote}{sep}{self.path}" - +# PART_RES is rebound by configure_part_patterns(); always read it dynamically +# from the rcjav.ids module rather than capturing a stale binding at import time. +def _current_part_res(): + return _rcjav_ids.PART_RES def human_size(n: int) -> str: @@ -149,141 +73,6 @@ def human_size(n: int) -> str: return f"{nf:.2f} PiB" -# Matches a trailing lowercase letter variant designator, e.g. the 'z' in IBW-902z. -_VARIANT_SUFFIX_RE = re.compile(r"^(.+?)([a-z])$") - -# Strips `[resolution]` and ` - Actress Name` from a stem so that part-suffix -# patterns anchored at `$` fire correctly. 
-# Canonical naming: {ID}[-{part}][ - {actress}][ [{resolution}]] -_RESOLUTION_TAG_RE = re.compile(r"\s*\[[^\]]*\]\s*$") - -# Bracket-wrapped ID: [REAL-779] or [HODV-21076] Saki Hatsumi [1080p] -_BRACKET_ID_RE = re.compile(r"^\[([^\]]+)\]") -_RES_LABEL_RE = re.compile(r"\[(?:2160|1080|720|480)p\]", re.IGNORECASE) -_VIDEO_EXTS = { - ".avi", ".flv", ".m2ts", ".m4v", ".mkv", ".mov", ".mp4", ".mpeg", - ".mpg", ".ts", ".webm", ".wmv", -} -_LOWEST_KEEP_PRIORITY_EXTS = {".ts"} - -# No-hyphen ID fallback: MVSD312 → MVSD-312 (letters-only prefix + digits, no hyphen) -_NOHYPHEN_ID_RE = re.compile(r"^([A-Za-z]{2,8})(\d{3,6})") - - -def _clean_stem_for_parts(stem: str) -> str: - """Return stem with trailing [tag] and ' - Actress' stripped. - Resolution is always the last bracketed token; actress follows ' - '.""" - s = _RESOLUTION_TAG_RE.sub("", stem).strip() - if " - " in s: - s = s[:s.index(" - ")].strip() - return s - - -def _part_detection_stems(stem: str) -> list[str]: - """Return stem stages for part detection from least to most cleaned.""" - resolution_clean = _RESOLUTION_TAG_RE.sub("", stem).strip() - actress_clean = _clean_stem_for_parts(stem) - out: list[str] = [] - for candidate in (stem, resolution_clean, actress_clean): - if candidate and candidate not in out: - out.append(candidate) - return out - - -def detect_part_from_stem(stem: str) -> str | None: - """Try part suffix rules before and after metadata cleanup.""" - for candidate in _part_detection_stems(stem): - part = detect_part(candidate) - if part: - return part - return None - - -def extract_id(name: str) -> str | None: - stem = Path(name).stem - - # Strip bracket wrapper: [REAL-779] → REAL-779, [SCOP-297] [1080p] → SCOP-297 - effective_stem = stem - if stem.startswith("["): - bm = _BRACKET_ID_RE.match(stem) - if bm: - effective_stem = bm.group(1).strip() - - m = PRIMARY_ID_RE.match(effective_stem) - if not m: - m = COMPOUND_ID_RE.match(effective_stem) - if not m: - m = FALLBACK_ID_RE.match(effective_stem) 
- if not m: - # No-hyphen fallback: MVSD312 → MVSD-312 - m = _NOHYPHEN_ID_RE.match(effective_stem) - if not m: - return None - - num = int(m.group(2)) - width = max(3, len(m.group(2))) - prefix = m.group(1).upper() - if prefix == "FC2": - prefix = "FC2-PPV" - - # Check the character immediately after the matched digits. - # Lowercase → variant designator (e.g. IBW-902z): fold into the base ID. - # Uppercase A-D → part letter: handled below by detect_part. - # Anything else (space, '[', end-of-string) → no variant. - after = effective_stem[m.end():m.end() + 1] - variant = after if after.islower() else "" - - base = f"{prefix}-{num:0{width}d}{variant}" - - # Use original stem (not effective_stem) so bracket-wrapped filenames like - # [REAL-779-1].mp4 still get part detection applied to the full stem. - # Run before and after metadata cleanup: raw suffixes such as - # "KV-118 - Actress_PART1" must survive, while trailing [1080p] tags still - # need cleanup before end-anchored detectors can match. - part = detect_part_from_stem(stem) - return f"{base}#part{part_key(part)}" if part else base - - -def normalize_id(raw: str) -> str | None: - return extract_id(raw + ".x") # add dummy ext so stem keeps the ID intact - - -def describe_id_match(display_query: str, matched_query: str, matched_id: str, - expansion_count: int) -> dict[str, str]: - """Explain the matcher path used for one ID hit in JSON output.""" - if "*" in matched_query or "?" 
in matched_query: - kind, label, confidence = "wildcard", "Wildcard ID", "broad" - elif expansion_count > 1: - kind, label, confidence = "range", "Range member", "expanded" - elif "#part" in matched_query: - kind, label, confidence = "exact_part", "Exact part ID", "high" - elif matched_id.startswith(matched_query + "#part"): - kind, label, confidence = "part", "Base ID + part", "related" - elif display_query.upper() != matched_query.upper(): - kind, label, confidence = "normalized", "Normalized ID", "normalized" - else: - kind, label, confidence = "exact", "Exact ID", "high" - return { - "match_kind": kind, - "match_reason": label, - "match_confidence": confidence, - "matched_query": matched_query, - "matched_id": matched_id, - } - - -def expand_range(raw: str) -> list[str] | None: - """Expand a bracket range like 'IPZZ-[820-860]' into individual ID strings. - Returns None if no range marker present.""" - m = RANGE_RE.search(raw) - if not m: - return None - a, b = int(m.group(1)), int(m.group(2)) - lo, hi = (a, b) if a <= b else (b, a) - width = max(len(m.group(1)), len(m.group(2))) # preserve zero-padding - return [raw[:m.start()] + f"{n:0{width}d}" + raw[m.end():] for n in range(lo, hi + 1)] - - RCLONE_BIN = "rclone" BASIC = False # set by --basic USE_ANSI = True # disabled by --no-color diff --git a/rcjav/__init__.py b/rcjav/__init__.py new file mode 100644 index 0000000..b146b17 --- /dev/null +++ b/rcjav/__init__.py @@ -0,0 +1,24 @@ +"""rcjav — internal package split out of rc-jav.py. + +This file re-exports the names that external callers (tests, fixtures +runner, native messaging host, in-tree code in rc-jav.py) expect to +find at the top level. Adding a new submodule does not change the +public surface — only this file does. 
+""" +from rcjav.model import FileEntry # noqa: F401 +from rcjav.ids import ( # noqa: F401 + PRIMARY_ID_RE, + FALLBACK_ID_RE, + COMPOUND_ID_RE, + RANGE_RE, + BUILTIN_PART_RES, + PART_RES, + configure_part_patterns, + detect_part, + detect_part_from_stem, + part_key, + extract_id, + normalize_id, + describe_id_match, + expand_range, +) diff --git a/rcjav/ids.py b/rcjav/ids.py new file mode 100644 index 0000000..7e70437 --- /dev/null +++ b/rcjav/ids.py @@ -0,0 +1,243 @@ +"""JAV ID extraction, normalization, and part-suffix detection. + +This is the single source of truth for everything that influences the +`jav_id` field of a FileEntry. Any change here is a "rules" change in +the cache-contract sense (see docs/CACHE_CONTRACT.md in the extension +repo) and bumps `id_rules` once that contract is implemented. + +Cross-side note: the browser extension's content.js mirrors a subset +of this logic (page-title surface, no part suffix). The shared +fixture corpus at fixtures/ pins the cases both sides must agree on. +""" +from __future__ import annotations + +import re +from pathlib import Path +from typing import Iterable + + +PRIMARY_ID_RE = re.compile(r"^([A-Za-z]+)-(\d+)") +FALLBACK_ID_RE = re.compile(r"^([A-Za-z0-9]+)-(\d+)") +COMPOUND_ID_RE = re.compile(r"^([A-Za-z0-9]+(?:-[A-Za-z0-9]+)+)-(\d+)") + +# Part-suffix patterns: anchored at end of stem (after stripping extension). +# Each pattern's group(1) is the part number. +RANGE_RE = re.compile(r"\[(\d+)-(\d+)\]") + +# Non-anchored XofY probe used in detect_part() to resolve the priority conflict +# between a trailing (N) copy-marker suffix and an embedded XofY part indicator. +# Example: "ENKI-031 [1080p].2of2 (1)" — the (1) is a filesystem collision suffix +# (rclone, Windows copy), not a part number; the 2of2 is the real part indicator. +# This pattern intentionally has no end-anchor so it matches anywhere in the stem. 
+_XOFY_PRIORITY_RE = re.compile(r"[._ -](\d+)\s*of\s*\d+", re.IGNORECASE)
+
+BUILTIN_PART_RES = [
+    re.compile(r"[-_ ](?:pt|part|cd|disc)[-_ ]?(\d+)$", re.IGNORECASE),
+    re.compile(r"\s*\((\d+)(?:\s*of\s*\d+)?\)$", re.IGNORECASE),
+    # Exported multipart filenames often end in `.1of2` / `-2 of 4`.
+    re.compile(r"[._ -](\d+)\s*of\s*\d+$", re.IGNORECASE),
+    # Bare numeric suffixes (`_N`, ` N`) are only treated as part numbers when
+    # the number is 1-2 digits. Wider patterns falsely matched resolution tags
+    # (`_2160`, `_4K2160`) and dates/years (`SSIS-001 2023.mp4` -> `#part2023`),
+    # corrupting cache keys.
+    # Staged detection also retries after resolution/actress cleanup, so end
+    # anchors can match both raw suffixes and metadata-blocked suffixes safely.
+    re.compile(r"_(\d{1,2})$"),
+    # Hyphen short-part suffix after the ID, e.g. OFJE-195-1 [480p].mp4.
+    # Limit to 1-2 digits so the base ID's usual 3+ digit numeric component
+    # does not make every canonical `ABC-123` filename look multipart.
+    re.compile(r"-(\d{1,2})$"),
+    # Lettered parts: separator (hyphen or underscore) followed by A-D.
+    # Uppercase only — lowercase letters are variant designators (e.g. IBW-902z)
+    # and are preserved as part of the base ID, not treated as part numbers.
+    re.compile(r"[-_]([A-D])$"),
+    # Bare uppercase letter directly after the ID digits with no separator,
+    # e.g. BAK-052A, BAK-052B. Lookbehind ensures a digit precedes.
+    re.compile(r"(?<=\d)([A-D])$"),
+    re.compile(r"\s+(\d{1,2})$"),
+]
+PART_RES = list(BUILTIN_PART_RES)
+
+
+def configure_part_patterns(patterns: Iterable[str]) -> list[str]:
+    """Reset PART_RES to builtins + user regexes (group 1 = part number); return errors."""
+    # Reset in place (no rebinding) so `from rcjav.ids import PART_RES` aliases stay live.
+    PART_RES[:] = BUILTIN_PART_RES
+    errors: list[str] = []
+    for pattern in patterns:
+        pattern = str(pattern or "").strip()
+        if not pattern:
+            continue
+        try:
+            compiled = re.compile(pattern, re.IGNORECASE)
+        except re.error as e:
+            errors.append(f"{pattern!r}: {e}")
+            continue
+        if compiled.groups < 1:
+            errors.append(f"{pattern!r}: needs a capture group for the part number")
+            continue
+        PART_RES.append(compiled)
+    return errors
+
+
+def detect_part(stem: str) -> str | None:
+    """Return part number as string if stem ends with a part suffix, else None.
+
+    XofY (e.g. .2of2) anywhere in the stem takes unconditional priority over a
+    trailing (N) suffix. A file named 'ENKI-031 [1080p].2of2 (1).mp4' is part 2;
+    the trailing (1) is a filesystem copy-collision marker (rclone / Windows),
+    not a part number. Without this pre-check the ordered PART_RES list would
+    match (1) first and misclassify the file as part 1.
+    """
+    m = _XOFY_PRIORITY_RE.search(stem)
+    if m:
+        return m.group(1)
+    for r in PART_RES:
+        m = r.search(stem)
+        if m:
+            return m.group(1)
+    return None
+
+
+def part_key(part: str) -> str:
+    token = str(part or "").strip()
+    if token.isdigit():
+        return str(int(token))
+    if len(token) == 1 and token.isalpha():
+        return str(ord(token.upper()) - ord("A") + 1)
+    return token.upper()
+
+
+# Matches a trailing lowercase letter variant designator, e.g. the 'z' in IBW-902z.
+_VARIANT_SUFFIX_RE = re.compile(r"^(.+?)([a-z])$")
+
+# Strips `[resolution]` and ` - Actress Name` from a stem so that part-suffix
+# patterns anchored at `$` fire correctly.
+# Canonical naming: {ID}[-{part}][ - {actress}][ [{resolution}]]
+_RESOLUTION_TAG_RE = re.compile(r"\s*\[[^\]]*\]\s*$")
+
+# Bracket-wrapped ID: [REAL-779] or [HODV-21076] Saki Hatsumi [1080p]
+_BRACKET_ID_RE = re.compile(r"^\[([^\]]+)\]")
+_RES_LABEL_RE = re.compile(r"\[(?:2160|1080|720|480)p\]", re.IGNORECASE)
+_VIDEO_EXTS = {
+    ".avi", ".flv", ".m2ts", ".m4v", ".mkv", ".mov", ".mp4", ".mpeg",
+    ".mpg", ".ts", ".webm", ".wmv",
+}
+_LOWEST_KEEP_PRIORITY_EXTS = {".ts"}
+
+# No-hyphen ID fallback: MVSD312 -> MVSD-312 (letters-only prefix + digits, no hyphen)
+_NOHYPHEN_ID_RE = re.compile(r"^([A-Za-z]{2,8})(\d{3,6})")
+
+
+def _clean_stem_for_parts(stem: str) -> str:
+    """Return stem with trailing [tag] and ' - Actress' stripped.
+    Resolution is always the last bracketed token; actress follows ' - '."""
+    s = _RESOLUTION_TAG_RE.sub("", stem).strip()
+    if " - " in s:
+        s = s[:s.index(" - ")].strip()
+    return s
+
+
+def _part_detection_stems(stem: str) -> list[str]:
+    """Return stem stages for part detection from least to most cleaned."""
+    resolution_clean = _RESOLUTION_TAG_RE.sub("", stem).strip()
+    actress_clean = _clean_stem_for_parts(stem)
+    out: list[str] = []
+    for candidate in (stem, resolution_clean, actress_clean):
+        if candidate and candidate not in out:
+            out.append(candidate)
+    return out
+
+
+def detect_part_from_stem(stem: str) -> str | None:
+    """Try part suffix rules before and after metadata cleanup."""
+    for candidate in _part_detection_stems(stem):
+        part = detect_part(candidate)
+        if part:
+            return part
+    return None
+
+
+def extract_id(name: str) -> str | None:
+    stem = Path(name).stem
+
+    # Strip bracket wrapper: [REAL-779] -> REAL-779, [SCOP-297] [1080p] -> SCOP-297
+    effective_stem = stem
+    if stem.startswith("["):
+        bm = _BRACKET_ID_RE.match(stem)
+        if bm:
+            effective_stem = bm.group(1).strip()
+
+    m = PRIMARY_ID_RE.match(effective_stem)
+    if not m:
+        m = COMPOUND_ID_RE.match(effective_stem)
+    if not m:
+        m = FALLBACK_ID_RE.match(effective_stem)
+    if not m:
+        # No-hyphen fallback: MVSD312 -> MVSD-312
+        m = _NOHYPHEN_ID_RE.match(effective_stem)
+    if not m:
+        return None
+
+    num = int(m.group(2))
+    width = max(3, len(m.group(2)))
+    prefix = m.group(1).upper()
+    if prefix == "FC2":
+        prefix = "FC2-PPV"
+
+    # Check the character immediately after the matched digits.
+    # Lowercase -> variant designator (e.g. IBW-902z): fold into the base ID.
+    # Uppercase A-D -> part letter: handled below by detect_part.
+    # Anything else (space, '[', end-of-string) -> no variant.
+    after = effective_stem[m.end():m.end() + 1]
+    variant = after if after.islower() else ""
+
+    base = f"{prefix}-{num:0{width}d}{variant}"
+
+    # Part detection runs on the original stem, not effective_stem, before and
+    # after metadata cleanup: raw suffixes such as "KV-118 - Actress_PART1" must
+    # survive, while trailing [1080p] tags need cleanup before end-anchored detectors
+    # can match. NOTE(review): a fully bracketed stem like "[REAL-779-1]" ends in "]",
+    # so the built-in `$`-anchored patterns find no part there — confirm intended.
+    part = detect_part_from_stem(stem)
+    return f"{base}#part{part_key(part)}" if part else base
+
+
+def normalize_id(raw: str) -> str | None:
+    return extract_id(raw + ".x")  # add dummy ext so stem keeps the ID intact
+
+
+def describe_id_match(display_query: str, matched_query: str, matched_id: str,
+                      expansion_count: int) -> dict[str, str]:
+    """Explain the matcher path used for one ID hit in JSON output."""
+    if "*" in matched_query or "?" in matched_query:
+        kind, label, confidence = "wildcard", "Wildcard ID", "broad"
+    elif expansion_count > 1:
+        kind, label, confidence = "range", "Range member", "expanded"
+    elif "#part" in matched_query:
+        kind, label, confidence = "exact_part", "Exact part ID", "high"
+    elif matched_id.startswith(matched_query + "#part"):
+        kind, label, confidence = "part", "Base ID + part", "related"
+    elif display_query.upper() != matched_query.upper():
+        kind, label, confidence = "normalized", "Normalized ID", "normalized"
+    else:
+        kind, label, confidence = "exact", "Exact ID", "high"
+    return {
+        "match_kind": kind,
+        "match_reason": label,
+        "match_confidence": confidence,
+        "matched_query": matched_query,
+        "matched_id": matched_id,
+    }
+
+
+def expand_range(raw: str) -> list[str] | None:
+    """Expand a bracket range like 'IPZZ-[820-860]' into individual ID strings.
+    Returns None if no range marker present."""
+    m = RANGE_RE.search(raw)
+    if not m:
+        return None
+    a, b = int(m.group(1)), int(m.group(2))
+    lo, hi = (a, b) if a <= b else (b, a)
+    width = max(len(m.group(1)), len(m.group(2)))  # preserve zero-padding
+    return [raw[:m.start()] + f"{n:0{width}d}" + raw[m.end():] for n in range(lo, hi + 1)]
diff --git a/rcjav/model.py b/rcjav/model.py
new file mode 100644
index 0000000..5ec9591
--- /dev/null
+++ b/rcjav/model.py
@@ -0,0 +1,24 @@
+"""Shared data shapes used by multiple submodules.
+
+Kept tiny on purpose — only types whose definition is depended on
+across module boundaries belong here. Behavior (find_dupes, decide_keep,
+extract_id, etc.) lives in the module that owns it.
+"""
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+
+@dataclass
+class FileEntry:
+    source: str  # "Source" (priority) or "Target"
+    remote: str  # the rclone remote:path root supplied
+    path: str  # relative path within remote
+    size: int
+    mod_time: str
+    jav_id: str  # normalized, e.g. "SSIS-001" (number zero-padded to >= 3 digits)
+
+    @property
+    def full_path(self) -> str:
+        sep = "" if self.remote.endswith("/") or not self.path else "/"
+        return f"{self.remote}{sep}{self.path}"