Initial commit

2026-05-26 22:46:00 +02:00
commit 7e2c2ff89c
256 changed files with 51523 additions and 0 deletions
@@ -0,0 +1,254 @@
+/**
+ * Token-grammar classifier for video filenames in a JAVID group.
+ *
+ * Patterns use a simplified token grammar (option A1 from the mockups):
+ *   - `{N}` — one or more digits, captured as the part index
+ *   - `{L}` — single letter A–Z, captured (A=1, B=2, ...)
+ *   - everything else is a literal character
+ *
+ * Patterns match at the END of the filename stem (no extension),
+ * case-insensitive.
+ *
+ * Classification rules for files sharing one normalized JAV code:
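+ *   - "part"    — stem ends with a configured pattern; index is the
+ *                 captured numeric/letter value.
+ *   - "variant" — stem does NOT match any pattern but equals, or starts
+ *                 with `<group>.`, the variant group (suffix-stripped
+ *                 stem) of a file that DID match. Variants share that
+ *                 variant group with the matching part(s). When no file
+ *                 in the code group matches any pattern, every file is
+ *                 a variant of one common group.
+ *   - "single"  — the only file in its code group, or a file that
+ *                 matches no pattern and cannot be attached to any
+ *                 part's variant group.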
+ *
+ * Tiebreak for "default variant" (the one to play first): the file
+ * whose stem equals the variant_group exactly. Otherwise the
+ * alphabetically first stem in the group.
+ */
+/**
+ * A token-grammar pattern paired with the regex compiled from it.
+ * `compileToken` builds these; `tryMatch` consumes them.
+ */
+export interface CompiledPattern {
+  /** Original token-grammar source. */
+  source: string;
+  /** Compiled regex anchored to end-of-stem (case-insensitive). */
+  re: RegExp;
+  /** What the captured token represents: `{N}` yields "digits", `{L}` yields "letter". */
+  kind: "digits" | "letter";
+}
+
+/** Minimal description of one file presented to the classifier. */
+export interface ClassifyInput {
+  /**
+   * Stable identifier, opaque to the classifier. `classifyGroup` uses it as
+   * the lookup key for pattern hits, so it should be unique within a group.
+   */
+  key: string;
+  /** Filename stem (no extension), as on disk. */
+  stem: string;
+}
+
+/** Classification verdict for one input file, as returned by `classifyGroup`. */
+export interface ClassifyResult {
+  /** The `key` of the input file this result belongs to. */
+  key: string;
+  /** Role of the file inside its code group. */
+  partKind: "part" | "variant" | "single";
+  /** 1-based sort index for parts; null otherwise. */
+  partIndex: number | null;
+  /**
+   * Stem-with-suffix-stripped — variants share this with their part.
+   * `classifyGroup` sets it to null for "single" results.
+   */
+  variantGroup: string | null;
+}
+
+// Matches every `{N}` or `{L}` token occurrence (global flag).
+// NOTE(review): no code visible in this file references this constant —
+// compileToken scans for tokens by hand. Confirm before using or removing it.
+const TOKEN_RE = /\{[NL]\}/g;
+
+/**
+ * Compile one token-grammar pattern into an end-anchored, case-insensitive
+ * regex.
+ *
+ * Exactly one capture token (`{N}` or `{L}`) must be present; every other
+ * character is matched literally (regex metacharacters are escaped). A `}`
+ * that does not close a token is treated as a literal.
+ *
+ * @param source - Pattern in token grammar, e.g. `-cd{N}`.
+ * @returns The compiled pattern, or `null` (this function does not throw on
+ *   bad input) when `source` is empty, contains an unknown or unterminated
+ *   `{...}` token, holds more than one capture token, or holds none.
+ */
+export function compileToken(source: string): CompiledPattern | null {
+  if (!source) return null;
+
+  // Regex metacharacters that need a backslash when used as literals.
+  const needsEscape = /[.*+?^${}()|[\]\\]/;
+  const fragments: string[] = [];
+  let captureKind: "digits" | "letter" | null = null;
+
+  for (let pos = 0; pos < source.length; ) {
+    const ch = source[pos]!;
+    if (ch !== "{") {
+      fragments.push(needsEscape.test(ch) ? "\\" + ch : ch);
+      pos += 1;
+      continue;
+    }
+
+    // An opening brace must start one of the two known three-character tokens.
+    let tokenKind: "digits" | "letter";
+    if (source.startsWith("{N}", pos)) {
+      tokenKind = "digits";
+    } else if (source.startsWith("{L}", pos)) {
+      tokenKind = "letter";
+    } else {
+      return null;
+    }
+
+    // Only one capture is allowed per pattern.
+    if (captureKind !== null) return null;
+    captureKind = tokenKind;
+    fragments.push(tokenKind === "digits" ? "(\\d+)" : "([A-Za-z])");
+    pos += 3;
+  }
+
+  // A pattern without any capture token cannot yield a part index.
+  if (captureKind === null) return null;
+  return {
+    source,
+    re: new RegExp(fragments.join("") + "$", "i"),
+    kind: captureKind,
+  };
+}
+
+/**
+ * Compile every pattern in `sources`, keeping input order.
+ *
+ * Patterns that `compileToken` rejects (it returns `null` for them) are
+ * dropped without any error being reported.
+ *
+ * @param sources - Token-grammar pattern strings.
+ * @returns The successfully compiled patterns.
+ */
+export function compilePatterns(sources: string[]): CompiledPattern[] {
+  return sources
+    .map((src) => compileToken(src))
+    .filter((compiled): compiled is CompiledPattern => compiled !== null);
+}
+
+/**
+ * Convert the text captured by a pattern token into a 1-based part index.
+ *
+ * @param capture - Text captured by the `{N}` or `{L}` token.
+ * @param kind - Which token produced the capture.
+ * @returns For "digits": the decimal value, or `null` if the capture is not
+ *   a plain run of ASCII digits, is zero, or overflows to Infinity.
+ *   For "letter": 1 for A/a through 26 for Z/z based on the first character,
+ *   or `null` if the capture is empty or does not start with an ASCII letter.
+ */
+function indexFromCapture(capture: string, kind: "digits" | "letter"): number | null {
+  if (kind === "digits") {
+    // Number() alone would also accept "", "1e3", "0x10" or " 7 "; insist on
+    // plain decimal digits so only real digit captures become indexes.
+    if (!/^\d+$/.test(capture)) return null;
+    const value = Number(capture);
+    // Over-long digit runs overflow to Infinity; zero is not a 1-based index.
+    return Number.isFinite(value) && value > 0 ? value : null;
+  }
+  // Letter: A=1, B=2, ...
+  // Validate the raw character first: an empty capture would otherwise give
+  // a NaN char code (which slips past range comparisons), and characters
+  // such as "ß" upper-case to ASCII letters ("SS").
+  if (!/^[A-Za-z]/.test(capture)) return null;
+  return capture.toUpperCase().charCodeAt(0) - 64;
+}
+
+/** Outcome of a successful pattern match against one stem (see `tryMatch`). */
+interface PatternHit {
+  /** Part index derived from the captured token. */
+  partIndex: number;
+  /** Stem with the matched suffix removed. */
+  variantGroup: string;
+}
+
+/**
+ * Find the first pattern that matches `stem` and yields a usable index.
+ *
+ * Patterns are tried in array order. A pattern whose capture does not
+ * convert to an index (`indexFromCapture` returns `null`) is skipped and
+ * the next pattern is tried.
+ *
+ * @param stem - Filename stem to test.
+ * @param patterns - Compiled patterns, in priority order.
+ * @returns The part index plus the stem text preceding the match, or `null`
+ *   when no pattern produces a usable hit.
+ */
+function tryMatch(stem: string, patterns: CompiledPattern[]): PatternHit | null {
+  for (const pattern of patterns) {
+    const match = stem.match(pattern.re);
+    const index = match ? indexFromCapture(match[1] ?? "", pattern.kind) : null;
+    if (match && index != null) {
+      // Everything before the matched suffix names the variant group.
+      return { partIndex: index, variantGroup: stem.slice(0, match.index!) };
+    }
+  }
+  return null;
+}
+
+/**
+ * Classify files that all share one normalized JAV code.
+ *
+ * Behaviour, in order:
+ *   1. An empty input yields an empty result; a lone file is always
+ *      "single" (patterns are not consulted).
+ *   2. Every stem is tested against `patterns`. Files with a hit become
+ *      "part", carrying the captured index and the suffix-stripped stem
+ *      as `variantGroup`.
+ *   3. If no file had a hit, every file becomes a "variant" whose group
+ *      is the longest common prefix of all stems with trailing
+ *      whitespace, `.`, `_` and `-` removed; when that prefix is empty,
+ *      each file falls back to its own stem.
+ *   4. Otherwise an unmatched stem `S` becomes a "variant" of the longest
+ *      non-empty part group `G` for which `S === G` or `S` starts with
+ *      `G + "."` (e.g. `XXX-001.fixed` next to `XXX-001-cd1`). Unmatched
+ *      stems that fit no group are "single".
+ *
+ * NOTE(review): with a `-cd{N}` pattern, a stem such as
+ * `XXX-001-cd1.fixed` does not start with `XXX-001.` and is therefore
+ * classified "single" — confirm whether per-part alternates were meant
+ * to attach.
+ *
+ * @param files - Files of one code group; `key` is used to look up hits.
+ * @param patterns - Compiled part-suffix patterns, tried in order.
+ * @returns One result per input file, in input order.
+ */
+export function classifyGroup(
+  files: ClassifyInput[],
+  patterns: CompiledPattern[],
+): ClassifyResult[] {
+  if (files.length === 0) return [];
+  if (files.length === 1) {
+    const [lone] = files;
+    return [{ key: lone!.key, partKind: "single", partIndex: null, variantGroup: null }];
+  }
+
+  // Step 1: collect pattern hits, indexed by file key.
+  const hitByKey = new Map<string, PatternHit>();
+  for (const file of files) {
+    const hit = tryMatch(file.stem, patterns);
+    if (hit !== null) hitByKey.set(file.key, hit);
+  }
+
+  // Step 2: without any part suffix in the group, all files are alternates
+  // of one another.
+  if (hitByKey.size === 0) {
+    const shared = longestCommonPrefix(files.map((file) => file.stem));
+    return files.map(
+      (file): ClassifyResult => ({
+        key: file.key,
+        partKind: "variant",
+        partIndex: null,
+        variantGroup: shared || file.stem,
+      }),
+    );
+  }
+
+  // Step 3: distinct, non-empty part groups, longest first so the most
+  // specific group claims an unmatched stem.
+  const partGroups = Array.from(
+    new Set(Array.from(hitByKey.values(), (hit) => hit.variantGroup)),
+  )
+    .filter((group) => group !== "")
+    .sort((a, b) => b.length - a.length);
+
+  return files.map((file): ClassifyResult => {
+    const hit = hitByKey.get(file.key);
+    if (hit !== undefined) {
+      return {
+        key: file.key,
+        partKind: "part",
+        partIndex: hit.partIndex,
+        variantGroup: hit.variantGroup,
+      };
+    }
+
+    // Unmatched: attach when the stem is the group itself or extends it
+    // with a dot-separated suffix.
+    const owner = partGroups.find(
+      (group) => file.stem === group || file.stem.startsWith(`${group}.`),
+    );
+    if (owner === undefined) {
+      // Nothing to attach to — the file is a stray.
+      return { key: file.key, partKind: "single", partIndex: null, variantGroup: null };
+    }
+    return { key: file.key, partKind: "variant", partIndex: null, variantGroup: owner };
+  });
+}
+
+/**
+ * Longest prefix shared by every string in `strs`, compared by UTF-16
+ * code unit, with any trailing run of whitespace, `.`, `_` or `-` removed.
+ *
+ * @param strs - Strings to compare.
+ * @returns The trimmed common prefix; `""` for an empty list or when the
+ *   strings share no leading characters.
+ */
+function longestCommonPrefix(strs: string[]): string {
+  if (strs.length === 0) return "";
+  const first = strs[0]!;
+  // Number of leading code units of `first` still shared by all strings seen.
+  let shared = first.length;
+  for (const other of strs.slice(1)) {
+    const limit = Math.min(shared, other.length);
+    let k = 0;
+    while (k < limit && first.charCodeAt(k) === other.charCodeAt(k)) k += 1;
+    shared = k;
+    if (shared === 0) return "";
+  }
+  // Drop trailing separators so the result does not end on e.g. "XXX-001.".
+  return first.slice(0, shared).replace(/[\s._\-]+$/, "");
+}
+
+/**
+ * Choose which file of a variant group should play by default.
+ *
+ * The first file whose stem equals `group` exactly wins. Without such a
+ * file, the stem that sorts first under `localeCompare` is chosen; among
+ * equal stems the earliest in `variants` is kept. The input array is not
+ * modified.
+ *
+ * @param variants - Files that all share `group` as their variant group.
+ * @param group - The variant group stem.
+ * @returns The chosen file, or `null` when `variants` is empty.
+ */
+export function pickDefaultVariant<T extends { stem: string }>(
+  variants: T[],
+  group: string,
+): T | null {
+  let earliest: T | null = null;
+  for (const candidate of variants) {
+    // An exact stem match always beats the alphabetical fallback.
+    if (candidate.stem === group) return candidate;
+    if (earliest === null || candidate.stem.localeCompare(earliest.stem) < 0) {
+      earliest = candidate;
+    }
+  }
+  return earliest;
+}
+
+/**
+ * Derive a short display label for a variant from its stem and group.
+ *
+ *   - stem equal to the group                → `original`
+ *   - stem is `<group>.<rest>`               → `<rest>` (one dot removed)
+ *   - stem otherwise starts with the group   → remainder with leading
+ *     `.`, `_`, `-` and whitespace stripped
+ *   - stem does not start with the group     → the stem itself
+ *
+ * An empty remainder in the second or third case also gives `original`.
+ *
+ * @param stem - Filename stem of the variant.
+ * @param group - Variant group stem the file belongs to.
+ * @returns The label text.
+ */
+export function variantLabel(stem: string, group: string): string {
+  if (!stem.startsWith(group)) return stem;
+
+  const remainder = stem.slice(group.length);
+  if (remainder === "") return "original";
+
+  const label = remainder.startsWith(".")
+    ? remainder.slice(1)
+    : remainder.replace(/^[._\-\s]+/, "");
+  return label || "original";
+}