rclone-jav/rcjav/ids.py

"""JAV ID extraction, normalization, and part-suffix detection.

This is the single source of truth for everything that influences the
`jav_id` field of a FileEntry. Any change here is a "rules" change in
the cache-contract sense (see docs/CACHE_CONTRACT.md in the extension
repo) and bumps `id_rules` once that contract is implemented.

Cross-side note: the browser extension's content.js mirrors a subset
of this logic (page-title surface, no part suffix). The shared
fixture corpus at fixtures/ pins the cases both sides must agree on.
"""
from __future__ import annotations

import re
from pathlib import Path
from typing import Iterable


# ID patterns. All are anchored at the start of the string they are matched
# against; in each one group(1) is the prefix and group(2) the numeric part.
# extract_id() tries them in the order PRIMARY, COMPOUND, FALLBACK and finally
# _NOHYPHEN_ID_RE, keeping the first that matches.
#
# Letters-only prefix, e.g. `ABC-123`.
PRIMARY_ID_RE = re.compile(r"^([A-Za-z]+)-(\d+)")
# Single alphanumeric prefix segment (digits allowed), e.g. `FC2-1234567`.
FALLBACK_ID_RE = re.compile(r"^([A-Za-z0-9]+)-(\d+)")
# Prefix built from two or more hyphen-joined alphanumeric segments,
# e.g. `FC2-PPV-1234567` (prefix `FC2-PPV`).
COMPOUND_ID_RE = re.compile(r"^([A-Za-z0-9]+(?:-[A-Za-z0-9]+)+)-(\d+)")

# Part-suffix patterns (BUILTIN_PART_RES below) are anchored at end of stem
# (after stripping extension). Each such pattern's group(1) is the part number.
#
# RANGE_RE is NOT a part-suffix pattern: it finds a bracketed numeric range in
# a query such as `IPZZ-[820-860]`. group(1) and group(2) are the two range
# endpoints, consumed by expand_range().
RANGE_RE = re.compile(r"\[(\d+)-(\d+)\]")

# Non-anchored XofY probe used in detect_part() to resolve the priority conflict
# between a trailing (N) copy-marker suffix and an embedded XofY part indicator.
# Example: "ENKI-031 [1080p].2of2 (1)" — the (1) is a filesystem collision suffix
# (rclone, Windows copy), not a part number; the 2of2 is the real part indicator.
# This pattern intentionally has no end-anchor so it matches anywhere in the stem.
# group(1) is X (the part number); the total Y is matched but not captured.
_XOFY_PRIORITY_RE = re.compile(r"[._ -](\d+)\s*of\s*\d+", re.IGNORECASE)

# Built-in part-suffix detectors. detect_part() walks the active list in order
# and returns group(1) of the first pattern that matches, so the ORDER of this
# list is significant (and is included in current_rules_signature()).
BUILTIN_PART_RES = [
    # Keyword suffix: separator + pt/part/cd/disc + optional separator + number,
    # e.g. `-pt2`, `_PART1`, ` cd 3`.
    re.compile(r"[-_ ](?:pt|part|cd|disc)[-_ ]?(\d+)$", re.IGNORECASE),
    # Trailing parenthesised number, optionally written as `(N of M)`,
    # e.g. ` (2)` or `(1 of 3)`.
    re.compile(r"\s*\((\d+)(?:\s*of\s*\d+)?\)$", re.IGNORECASE),
    # Exported multipart filenames often end in `.1of2` / `-2 of 4`.
    re.compile(r"[._ -](\d+)\s*of\s*\d+$", re.IGNORECASE),
    # Bare numeric suffixes (`_N`, ` N`) are only treated as part numbers when
    # the number is 1-2 digits. Wider patterns falsely matched resolution tags
    # (`_2160`, `_4K2160`) and dates/years (`SSIS-001 2023.mp4` -> `#part2023`),
    # corrupting cache keys.
    # Staged detection also retries after resolution/actress cleanup, so end
    # anchors can match both raw suffixes and metadata-blocked suffixes safely.
    re.compile(r"_(\d{1,2})$"),
    # Hyphen short-part suffix after the ID, e.g. OFJE-195-1 [480p].mp4.
    # Limit to 1-2 digits so the base ID's usual 3+ digit numeric component
    # does not make every canonical `ABC-123` filename look multipart.
    re.compile(r"-(\d{1,2})$"),
    # Lettered parts: separator (hyphen or underscore) followed by A-D.
    # Uppercase only — lowercase letters are variant designators (e.g. IBW-902z)
    # and are preserved as part of the base ID, not treated as part numbers.
    re.compile(r"[-_]([A-D])$"),
    # Bare uppercase letter directly after the ID digits with no separator,
    # e.g. BAK-052A, BAK-052B.  Lookbehind ensures a digit precedes.
    re.compile(r"(?<=\d)([A-D])$"),
    # Whitespace-separated bare 1-2 digit suffix (the ` N` form described above).
    re.compile(r"\s+(\d{1,2})$"),
]
# Active pattern list consulted by detect_part(). Starts as a copy of the
# built-ins; configure_part_patterns() rebinds it to built-ins + user patterns.
PART_RES = list(BUILTIN_PART_RES)


def configure_part_patterns(patterns: Iterable[str]) -> list[str]:
    """Rebuild the active part-suffix list as built-ins plus user regexes.

    Every call starts over from ``BUILTIN_PART_RES``, so patterns supplied
    on an earlier call are discarded. Each entry in ``patterns`` is
    stringified and stripped; blank entries are ignored. Usable patterns are
    compiled case-insensitively and appended after the built-ins.

    Returns a list of human-readable messages, one per rejected pattern
    (invalid regex, or no capture group to carry the part number). An empty
    list means every non-blank pattern was accepted.
    """
    global PART_RES
    # Rebind first so the reset happens even if no pattern is usable.
    PART_RES = active = list(BUILTIN_PART_RES)
    problems: list[str] = []
    for raw in patterns:
        source = str(raw or "").strip()
        if source:
            try:
                regex = re.compile(source, re.IGNORECASE)
            except re.error as exc:
                problems.append(f"{source!r}: {exc}")
            else:
                if regex.groups >= 1:
                    active.append(regex)
                else:
                    problems.append(f"{source!r}: needs a capture group for the part number")
    return problems


def detect_part(stem: str) -> str | None:
    """Return the part token found in ``stem``, or None when there is none.

    Detection happens in two steps:

    1. An XofY marker (e.g. ``.2of2``) located anywhere in the stem wins
       outright. This keeps 'ENKI-031 [1080p].2of2 (1).mp4' classified as
       part 2: the trailing ``(1)`` is a copy-collision marker added by the
       filesystem (rclone / Windows), and would otherwise be picked up by the
       parenthesised-number rule and reported as part 1.
    2. Otherwise the active ``PART_RES`` list is scanned in order and the
       first matching pattern supplies the result.

    In both cases the returned value is the pattern's group(1).
    """
    priority_hit = _XOFY_PRIORITY_RE.search(stem)
    if priority_hit is not None:
        return priority_hit.group(1)
    # Lazy scan: stops at the first pattern that matches, like the ordered list
    # contract requires.
    hits = (pattern.search(stem) for pattern in PART_RES)
    first_hit = next((hit for hit in hits if hit), None)
    return first_hit.group(1) if first_hit else None


def part_key(part: str) -> str:
    """Normalize a detected part token into the value used after ``#part``.

    Rules, applied to the stripped token:

    * decimal digit strings become the integer without leading zeros
      (``"02"`` -> ``"2"``);
    * a single ASCII letter becomes its 1-based alphabet position
      (``"A"``/``"a"`` -> ``"1"``, ``"D"`` -> ``"4"``);
    * anything else is returned upper-cased;
    * a falsy ``part`` (``None``, ``""``) yields ``""``.
    """
    token = str(part or "").strip()
    # isdecimal(), not isdigit(): isdigit() is also true for characters such
    # as superscript digits, which int() rejects with ValueError.
    if token.isdecimal():
        return str(int(token))
    # Restrict the letter->ordinal mapping to ASCII. Some non-ASCII letters
    # upper-case to more than one character (e.g. "ß" -> "SS"), which made
    # ord() raise TypeError; others would map to meaningless large ordinals.
    if len(token) == 1 and token.isascii() and token.isalpha():
        return str(ord(token.upper()) - ord("A") + 1)
    return token.upper()


# Matches a trailing lowercase letter variant designator, e.g. the 'z' in IBW-902z.
# NOTE(review): extract_id() does not use this regex — it inspects the character
# right after the ID digits instead; within this module the pattern text only
# feeds current_rules_signature(). Confirm whether other modules import it.
_VARIANT_SUFFIX_RE = re.compile(r"^(.+?)([a-z])$")

# Strips one trailing `[...]` tag (normally the resolution) from a stem so that
# part-suffix patterns anchored at `$` fire correctly. The ` - Actress Name`
# portion is removed separately by _clean_stem_for_parts().
# Canonical naming: {ID}[-{part}][ - {actress}][ [{resolution}]]
_RESOLUTION_TAG_RE = re.compile(r"\s*\[[^\]]*\]\s*$")

# Bracket-wrapped ID: [REAL-779] or [HODV-21076] Saki Hatsumi [1080p]
# group(1) is the text inside the leading brackets.
_BRACKET_ID_RE = re.compile(r"^\[([^\]]+)\]")
# Bracketed resolution label such as [1080p] (2160/1080/720/480, any case).
# NOTE(review): not referenced by the functions in this part of the file —
# presumably used by callers; confirm.
_RES_LABEL_RE = re.compile(r"\[(?:2160|1080|720|480)p\]", re.IGNORECASE)
# Lower-case, dot-prefixed video file extensions.
# NOTE(review): not referenced by the functions in this part of the file.
_VIDEO_EXTS = {
    ".avi", ".flv", ".m2ts", ".m4v", ".mkv", ".mov", ".mp4", ".mpeg",
    ".mpg", ".ts", ".webm", ".wmv",
}
# NOTE(review): presumably the extensions ranked last when choosing which
# duplicate file to keep — verify against the caller.
_LOWEST_KEEP_PRIORITY_EXTS = {".ts"}

# No-hyphen ID fallback: MVSD312 -> MVSD-312 (letters-only prefix + digits, no hyphen)
# Prefix is 2-8 letters, numeric part 3-6 digits; tried last by extract_id().
_NOHYPHEN_ID_RE = re.compile(r"^([A-Za-z]{2,8})(\d{3,6})")


def _clean_stem_for_parts(stem: str) -> str:
    """Return ``stem`` reduced to the portion that can carry a part suffix.

    A trailing ``[tag]`` (normally the resolution) is removed first; then, if
    the remainder contains ``' - '``, everything from the first occurrence
    onward (the actress name in canonical naming) is dropped. The result is
    whitespace-stripped.
    """
    without_tag = _RESOLUTION_TAG_RE.sub("", stem).strip()
    head, separator, _actress = without_tag.partition(" - ")
    return head.strip() if separator else without_tag


def _part_detection_stems(stem: str) -> list[str]:
    """List the stem variants to probe for a part suffix, rawest first.

    The stages are: the stem as given, the stem without its trailing
    ``[tag]``, and the stem with both the tag and the `` - Actress`` portion
    removed. Empty stages and repeats of an earlier stage are left out.
    """
    stages = (
        stem,
        _RESOLUTION_TAG_RE.sub("", stem).strip(),
        _clean_stem_for_parts(stem),
    )
    # dict.fromkeys() de-duplicates while keeping first-seen order.
    return [stage for stage in dict.fromkeys(stages) if stage]


def detect_part_from_stem(stem: str) -> str | None:
    """Run detect_part() over each cleanup stage of ``stem``.

    Stages come from _part_detection_stems() (raw stem first, most cleaned
    last). The first non-empty part token found is returned; None if no stage
    yields one.
    """
    detected = (detect_part(stage) for stage in _part_detection_stems(stem))
    return next((part for part in detected if part), None)


def extract_id(name: str) -> str | None:
    """Derive the canonical ID (optionally with ``#partN``) from a file name.

    The extension is dropped via ``Path(name).stem``. The ID is then read from
    the start of the stem — or from inside a leading ``[...]`` wrapper — using
    PRIMARY, COMPOUND, FALLBACK and no-hyphen patterns in that order. The
    result is ``PREFIX-NNN`` with the prefix upper-cased, the number
    zero-padded to at least three digits, and a bare ``FC2`` prefix rewritten
    to ``FC2-PPV``. A lowercase character directly after the digits is kept
    as a variant designator (e.g. ``IBW-902z``).

    Returns None when no ID pattern matches.
    """
    stem = Path(name).stem

    # Bracket wrapper: [REAL-779] -> REAL-779, [SCOP-297] [1080p] -> SCOP-297.
    # The pattern is anchored on a leading '[', so it only fires for wrapped stems.
    wrapped = _BRACKET_ID_RE.match(stem)
    id_source = wrapped.group(1).strip() if wrapped else stem

    id_match = None
    for pattern in (PRIMARY_ID_RE, COMPOUND_ID_RE, FALLBACK_ID_RE, _NOHYPHEN_ID_RE):
        id_match = pattern.match(id_source)
        if id_match:
            break
    if id_match is None:
        return None

    raw_prefix, digits = id_match.group(1), id_match.group(2)
    prefix = raw_prefix.upper()
    if prefix == "FC2":
        prefix = "FC2-PPV"
    number = f"{int(digits):0{max(3, len(digits))}d}"

    # Look at the single character right after the matched digits:
    #   lowercase           -> variant designator, folded into the base ID
    #   uppercase A-D       -> part letter, left for part detection below
    #   anything else / end -> no variant
    next_char = id_source[id_match.end():id_match.end() + 1]
    variant = next_char if next_char.islower() else ""

    base = f"{prefix}-{number}{variant}"

    # Part detection deliberately runs on the ORIGINAL stem rather than the
    # unwrapped ID text, so bracket-wrapped names such as [REAL-779-1].mp4 are
    # still examined in full. It is staged (before and after metadata cleanup)
    # so raw suffixes like "KV-118 - Actress_PART1" survive while a trailing
    # [1080p] tag no longer blocks the end-anchored detectors.
    part = detect_part_from_stem(stem)
    if not part:
        return base
    return f"{base}#part{part_key(part)}"


def normalize_id(raw: str) -> str | None:
    """Normalize a bare ID string by running it through extract_id().

    extract_id() expects a file name and discards the final suffix, so a
    throwaway ``.x`` extension is appended first; that way the whole of
    ``raw`` survives as the stem.
    """
    pseudo_filename = raw + ".x"
    return extract_id(pseudo_filename)


def describe_id_match(display_query: str, matched_query: str, matched_id: str,
                      expansion_count: int) -> dict[str, str]:
    """Build the JSON-output annotation describing how one ID hit was matched.

    The first rule that applies decides the classification:

    1. ``matched_query`` contains ``*`` or ``?``          -> wildcard
    2. ``expansion_count`` is greater than 1              -> range member
    3. ``matched_query`` itself contains ``#part``        -> exact part
    4. ``matched_id`` is ``matched_query`` + ``#part...`` -> base ID + part
    5. display and matched query differ ignoring case     -> normalized
    6. otherwise                                          -> exact

    Returns a dict with ``match_kind``, ``match_reason``,
    ``match_confidence``, ``matched_query`` and ``matched_id``.
    """
    def classify():
        # Guard clauses, checked in priority order.
        if any(marker in matched_query for marker in ("*", "?")):
            return "wildcard", "Wildcard ID", "broad"
        if expansion_count > 1:
            return "range", "Range member", "expanded"
        if "#part" in matched_query:
            return "exact_part", "Exact part ID", "high"
        if matched_id.startswith(matched_query + "#part"):
            return "part", "Base ID + part", "related"
        if display_query.upper() != matched_query.upper():
            return "normalized", "Normalized ID", "normalized"
        return "exact", "Exact ID", "high"

    match_kind, match_reason, match_confidence = classify()
    result = {
        "match_kind": match_kind,
        "match_reason": match_reason,
        "match_confidence": match_confidence,
    }
    result["matched_query"] = matched_query
    result["matched_id"] = matched_id
    return result


def current_rules_signature() -> str:
    """Return a ``sha256:``-prefixed digest of the rules that shape a jav_id.

    The digest covers the source text of the ID regexes, the bracket, variant,
    XofY and resolution-tag regexes, the ordered ``BUILTIN_PART_RES`` sources,
    and the ordered ``PART_RES`` sources (so user patterns installed through
    `configure_part_patterns` are reflected too). The ``sha256:`` prefix lets
    callers identify the algorithm without re-deriving it.

    The payload is serialized as JSON with sorted keys, so the result is
    stable across invocations. Editing any regex changes the digest, and so
    does reordering the part-pattern lists — order is part of the contract
    because part detection stops at the first match.

    NOTE: only each regex's pattern text is hashed; its compile flags are not,
    so a flags-only change would leave the digest unchanged.
    """
    import hashlib
    import json as _json

    payload = {
        "schema": 1,  # bump when this signature schema itself changes
        "fc2_handling": "fc2_to_ppv",
        # ID extraction patterns
        "primary": PRIMARY_ID_RE.pattern,
        "compound": COMPOUND_ID_RE.pattern,
        "fallback": FALLBACK_ID_RE.pattern,
        "nohyphen": _NOHYPHEN_ID_RE.pattern,
        "bracket": _BRACKET_ID_RE.pattern,
        "variant": _VARIANT_SUFFIX_RE.pattern,
        # Part detection patterns (list order is preserved in the JSON)
        "xofy": _XOFY_PRIORITY_RE.pattern,
        "resolution_tag": _RESOLUTION_TAG_RE.pattern,
        "builtin_part_res": [regex.pattern for regex in BUILTIN_PART_RES],
        "part_res": [regex.pattern for regex in PART_RES],
    }
    canonical = _json.dumps(payload, sort_keys=True, ensure_ascii=False)
    digest = hashlib.sha256(canonical.encode("utf-8")).hexdigest()
    return "sha256:" + digest


def expand_range(raw: str) -> list[str] | None:
    """Expand the first bracket range in ``raw`` into individual strings.

    'IPZZ-[820-860]' becomes ['IPZZ-820', ..., 'IPZZ-860']. The endpoints may
    be given in either order; output is always ascending and inclusive. Each
    number is zero-padded to the width of the longer endpoint as written, so
    '[001-003]' keeps its padding. Text before and after the bracket is
    carried over unchanged.

    Returns None if ``raw`` contains no range marker.
    """
    span = RANGE_RE.search(raw)
    if span is None:
        return None
    first_text, second_text = span.group(1), span.group(2)
    low, high = sorted((int(first_text), int(second_text)))
    pad = max(len(first_text), len(second_text))  # preserve zero-padding
    head, tail = raw[:span.start()], raw[span.end():]
    return [f"{head}{value:0{pad}d}{tail}" for value in range(low, high + 1)]