/**
 * Token-grammar classifier for video filenames in a JAVID group.
 *
 * Patterns use a simplified token grammar (option A1 from the mockups):
 *   - `{N}` — one or more digits, captured as the part index
 *   - `{L}` — single letter A–Z, captured (A=1, B=2, ...)
 *   - everything else is a literal character
 *
 * Patterns match at the END of the filename stem (no extension),
 * case-insensitive.
 *
 * Classification rules for files sharing one normalized JAV code:
 *   - "part"    — stem ends with a configured pattern; index is the
 *                 captured numeric/letter value. Its variant group is
 *                 the stem with the matched suffix removed.
 *   - "variant" — stem does NOT match any pattern, but it either equals
 *                 a matched part's variant group or starts with that
 *                 group followed by a dot. When no stem in the group
 *                 matches any pattern, every file is a variant of one
 *                 shared group.
 *   - "single"  — the only file in its code group (regardless of
 *                 patterns), or an unmatched file that cannot be
 *                 attached to any part's variant group.
 *
 * Tiebreak for "default variant" (the one to play first): the file
 * whose stem equals the variant group exactly. Otherwise the
 * alphabetically first stem in the group.
 */

export interface CompiledPattern {
  /** Original token-grammar source. */
  source: string;
  /** Compiled regex anchored to end-of-stem (case-insensitive). */
  re: RegExp;
  /** What the captured token represents. */
  kind: "digits" | "letter";
}

/** Minimal description of one file presented to the classifier. */
export interface ClassifyInput {
  /** Stable identifier, opaque to the classifier. */
  key: string;
  /** Filename stem (no extension), as on disk. */
  stem: string;
}

export interface ClassifyResult {
  key: string;
  partKind: "part" | "variant" | "single";
  /** 1-based sort index for parts; null otherwise. */
  partIndex: number | null;
  /** Stem-with-suffix-stripped — variants share this with their part. */
  variantGroup: string | null;
}

/** Matches a single `{N}` or `{L}` capture token. */
const TOKEN_RE = /\{[NL]\}/g;

/**
 * Compile one token-grammar pattern into an end-anchored regex.
 *
 * @param source Pattern text, e.g. `-cd{N}` or `-{L}`.
 * @returns The compiled pattern, or `null` when the pattern is unusable:
 *   empty source, an unterminated `{`, a `{...}` group other than
 *   `{N}` / `{L}`, more than one capture token, or no capture token.
 *   This function never throws for malformed input.
 */
export function compileToken(source: string): CompiledPattern | null {
  if (!source) return null;

  let kind: "digits" | "letter" | null = null;
  let body = "";
  let i = 0;
  while (i < source.length) {
    const c = source.charAt(i);
    if (c !== "{") {
      // Literal character: escape anything the regex engine treats as
      // special. A `}` with no opening `{` also lands here as a literal.
      body += c.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
      i++;
      continue;
    }

    // `{` must open one of the two known tokens.
    const close = source.indexOf("}", i);
    if (close < 0) return null;
    const tok = source.slice(i, close + 1);
    if (tok !== "{N}" && tok !== "{L}") return null;
    // Only one capture per pattern.
    if (kind !== null) return null;

    if (tok === "{N}") {
      body += "(\\d+)";
      kind = "digits";
    } else {
      body += "([A-Za-z])";
      kind = "letter";
    }
    i = close + 1;
  }

  // A pattern without a capture cannot yield a part index.
  if (kind === null) return null;
  return { source, re: new RegExp(body + "$", "i"), kind };
}

/** Compile a list of patterns; silently drops malformed ones. */
export function compilePatterns(sources: readonly string[]): CompiledPattern[] {
  const out: CompiledPattern[] = [];
  for (const s of sources) {
    const compiled = compileToken(s);
    if (compiled) out.push(compiled);
  }
  return out;
}

/**
 * Convert a captured token into a 1-based part index.
 *
 * Digits: the numeric value, truncated to an integer; `null` when it is
 * not a finite number greater than zero (so `0` / `00` are rejected).
 * Letter: A=1, B=2, ... (case-insensitive); `null` outside A–Z.
 */
function indexFromCapture(capture: string, kind: "digits" | "letter"): number | null {
  if (kind === "digits") {
    const n = Number(capture);
    return Number.isFinite(n) && n > 0 ? Math.trunc(n) : null;
  }
  const code = capture.toUpperCase().charCodeAt(0);
  // An empty capture yields NaN here; both comparisons are then false,
  // matching the historical behaviour of returning NaN rather than null.
  if (code < 65 || code > 90) return null;
  return code - 64;
}

interface PatternHit {
  partIndex: number;
  /** Stem with the matched suffix removed. */
  variantGroup: string;
}

/**
 * Find the first pattern (in list order) that matches the end of `stem`
 * and yields a valid part index. Patterns whose capture does not convert
 * to an index are skipped so a later pattern can still match.
 */
function tryMatch(stem: string, patterns: readonly CompiledPattern[]): PatternHit | null {
  for (const p of patterns) {
    const m = stem.match(p.re);
    if (!m) continue;
    const idx = indexFromCapture(m[1] ?? "", p.kind);
    if (idx == null) continue;
    return {
      partIndex: idx,
      // `index` is set for non-global regexes (what compileToken builds);
      // without it the whole stem is kept, as slicing to `undefined` would.
      variantGroup: stem.slice(0, m.index ?? stem.length),
    };
  }
  return null;
}

/**
 * Classify a group of files that share one normalized JAV code.
 *
 * Algorithm:
 *   0. An empty input yields an empty result; a lone file is always
 *      `single` (patterns are not consulted).
 *   1. Try each pattern against each stem; a stem that matches becomes a
 *      `part` whose variant group is the stem minus the matched suffix.
 *   2. If nothing matched, every file becomes a `variant` of one shared
 *      group: the longest common prefix of all stems with trailing
 *      separators (whitespace, `.`, `_`, `-`) removed. When that prefix
 *      is empty each file falls back to its own stem as the group.
 *   3. Otherwise each unmatched stem `S` is attached to the longest part
 *      group `G` such that `S === G` or `S` starts with `G + "."`
 *      (e.g. `XXX-001.fixed` attaches to the group `XXX-001` produced by
 *      `XXX-001-cd1`). Unmatched stems that attach nowhere are `single`.
 *      Note that `XXX-001-cd1.fixed` does NOT attach: the comparison is
 *      against the stripped group `XXX-001`, not the full part stem.
 *
 * @param files Files in the group; results are returned in the same order.
 * @param patterns Compiled part-suffix patterns, tried in order.
 */
export function classifyGroup(
  files: readonly ClassifyInput[],
  patterns: readonly CompiledPattern[],
): ClassifyResult[] {
  const [first] = files;
  if (first === undefined) return [];
  if (files.length === 1) {
    return [{ key: first.key, partKind: "single", partIndex: null, variantGroup: null }];
  }

  // Pass 1: pattern match.
  const hits = new Map<string, PatternHit>();
  for (const f of files) {
    const hit = tryMatch(f.stem, patterns);
    if (hit) hits.set(f.key, hit);
  }

  if (hits.size === 0) {
    // No part-style suffixes detected anywhere → treat the whole group
    // as variants of one part.
    const group = longestCommonPrefix(files.map((f) => f.stem));
    return files.map((f) => ({
      key: f.key,
      partKind: "variant" as const,
      partIndex: null,
      variantGroup: group || f.stem,
    }));
  }

  // Pass 2: attach unmatched stems to a part group they extend.
  // Empty groups (pattern consumed the whole stem) can never be attached to.
  const groupNames = Array.from(
    new Set(Array.from(hits.values(), (h) => h.variantGroup)),
  ).filter((g) => g !== "");
  // Sort by length desc so longer (more specific) groups bind first.
  groupNames.sort((a, b) => b.length - a.length);

  const out: ClassifyResult[] = [];
  for (const f of files) {
    const hit = hits.get(f.key);
    if (hit) {
      out.push({
        key: f.key,
        partKind: "part",
        partIndex: hit.partIndex,
        variantGroup: hit.variantGroup,
      });
      continue;
    }

    const attached = groupNames.find((g) => f.stem === g || f.stem.startsWith(g + "."));
    if (attached !== undefined) {
      out.push({ key: f.key, partKind: "variant", partIndex: null, variantGroup: attached });
    } else {
      // No way to attach — the file is a stray. Mark single.
      out.push({ key: f.key, partKind: "single", partIndex: null, variantGroup: null });
    }
  }
  return out;
}

/**
 * Longest prefix shared by every string, compared character by character
 * (case-sensitive), with any trailing run of whitespace, `.`, `_` or `-`
 * removed so the result does not end on a separator like "XXX-001.".
 * Returns "" for an empty list or when the strings share no prefix.
 */
function longestCommonPrefix(strs: readonly string[]): string {
  const [head, ...rest] = strs;
  if (head === undefined) return "";

  let prefix = head;
  for (const s of rest) {
    let j = 0;
    while (j < prefix.length && j < s.length && prefix[j] === s[j]) j++;
    prefix = prefix.slice(0, j);
    if (!prefix) return "";
  }
  return prefix.replace(/[\s._\-]+$/, "");
}

/**
 * From a set of files all sharing the same variantGroup, pick the one
 * to play by default. Rule: stem === group exactly; else alphabetically
 * first (by `localeCompare`).
 *
 * @returns The chosen element, or `null` when `variants` is empty.
 */
export function pickDefaultVariant<T extends { stem: string }>(
  variants: readonly T[],
  group: string,
): T | null {
  if (variants.length === 0) return null;
  const exact = variants.find((v) => v.stem === group);
  if (exact) return exact;
  // Copy before sorting so the caller's array is not reordered.
  return [...variants].sort((a, b) => a.stem.localeCompare(b.stem))[0] ?? null;
}

/**
 * Compute a short label for a variant relative to its group stem.
 *   `XXX-001.fixed` with group `XXX-001` → `fixed`.
 * Falls back to `original` for the default / matching stem, and when
 * nothing but separators follows the group. A stem that does not start
 * with the group is returned unchanged.
 */
export function variantLabel(stem: string, group: string): string {
  if (stem === group) return "original";
  if (stem.startsWith(group + ".")) {
    return stem.slice(group.length + 1) || "original";
  }
  if (stem.startsWith(group)) {
    return stem.slice(group.length).replace(/^[._\-\s]+/, "") || "original";
  }
  return stem;
}