Files
pinkudex/lib/video/partClassify.ts
T
2026-05-26 22:46:00 +02:00

255 lines
8.3 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/**
* Token-grammar classifier for video filenames in a JAVID group.
*
* Patterns use a simplified token grammar (option A1 from the mockups):
* - `{N}` — one or more digits, captured as the part index
* - `{L}` — single letter A-Z (case-insensitive), captured (A=1, B=2, ...)
* - everything else is a literal character
*
* Patterns match at the END of the filename stem (no extension),
* case-insensitive.
*
* Classification rules for files sharing one normalized JAV code:
* - "part" — stem ends with a configured pattern; index is the
* captured numeric/letter value.
* - "variant" — stem does NOT match any pattern but its prefix
* (first dot-segment) equals a stem that DID match.
* Variants belong to the matching part.
* - "single" — the only file in its code group (even if it matches a
* pattern), or an unmatched file that cannot be attached to any part.
*
* Tiebreak for "default variant" (the one to play first): the file
* whose stem equals the variant_group exactly. Otherwise the
* alphabetically first stem in the group.
*/
/**
 * One token-grammar pattern after compilation; this is the shape that
 * `compileToken` returns.
 */
export interface CompiledPattern {
/** Original token-grammar source string, kept verbatim. */
source: string;
/** Regex with a trailing `$` anchor and the `i` flag: matches at the end of a stem, case-insensitively. */
re: RegExp;
/** Which token produced the single capture group: `{N}` gives "digits", `{L}` gives "letter". */
kind: "digits" | "letter";
}
/** Minimal description of one file presented to the classifier. */
export interface ClassifyInput {
/**
 * Stable identifier, opaque to the classifier; copied unchanged into the
 * matching `ClassifyResult.key`. Presumably unique within one group, since
 * pattern hits are stored per key — verify against the caller.
 */
key: string;
/** Filename stem (no extension), as on disk. */
stem: string;
}
/** Classification of one input file, as produced by `classifyGroup`. */
export interface ClassifyResult {
/** The `key` of the `ClassifyInput` this result describes. */
key: string;
/** Role of the file inside its code group. */
partKind: "part" | "variant" | "single";
/** 1-based sort index for parts; null for variants and singles. */
partIndex: number | null;
/**
 * Grouping key. For a part: its stem with the matched suffix removed.
 * For a variant: the group it was attached to. Null for singles.
 */
variantGroup: string | null;
}
// Matches a `{N}` or `{L}` placeholder anywhere in a pattern source.
// NOTE(review): not referenced anywhere in the visible source — compileToken
// scans for tokens by hand. Confirm it is unused before removing it.
const TOKEN_RE = /\{[NL]\}/g;
/**
 * Compile one token-grammar pattern into an end-anchored, case-insensitive
 * regex.
 *
 * The source must contain exactly one placeholder: `{N}` (one or more
 * digits) or `{L}` (a single ASCII letter). Every other character is matched
 * literally.
 *
 * @param source Pattern in token grammar, e.g. `-cd{N}`.
 * @returns The compiled pattern, or `null` (never throws) when the source is
 *   empty, has an unclosed `{`, uses an unknown `{...}` token, contains more
 *   than one placeholder, or contains none at all.
 */
export function compileToken(source: string): CompiledPattern | null {
  if (!source) return null;

  let captureKind: "digits" | "letter" | null = null;
  const pieces: string[] = [];
  let cursor = 0;

  while (cursor < source.length) {
    // Everything up to the next `{` is literal text; escape regex
    // metacharacters so it matches verbatim (a stray `}` lands here too).
    const open = source.indexOf("{", cursor);
    const literalEnd = open === -1 ? source.length : open;
    pieces.push(
      source.slice(cursor, literalEnd).replace(/[.*+?^${}()|[\]\\]/g, "\\$&"),
    );
    if (open === -1) break;

    const close = source.indexOf("}", open);
    if (close === -1) return null; // unclosed brace
    if (captureKind !== null) return null; // only one capture per pattern

    switch (source.slice(open, close + 1)) {
      case "{N}":
        pieces.push("(\\d+)");
        captureKind = "digits";
        break;
      case "{L}":
        pieces.push("([A-Za-z])");
        captureKind = "letter";
        break;
      default:
        return null; // braces around anything else are invalid
    }
    cursor = close + 1;
  }

  // A pattern without a placeholder cannot yield a part index.
  if (captureKind === null) return null;

  return {
    source,
    re: new RegExp(`${pieces.join("")}$`, "i"),
    kind: captureKind,
  };
}
/**
 * Compile every pattern source in order, keeping only the ones that
 * `compileToken` accepts. Malformed sources are dropped without any error.
 *
 * @param sources Pattern sources in token grammar.
 * @returns The successfully compiled patterns, in input order.
 */
export function compilePatterns(sources: string[]): CompiledPattern[] {
  return sources
    .map((src) => compileToken(src))
    .filter((compiled): compiled is CompiledPattern => Boolean(compiled));
}
/**
 * Convert the text captured by a compiled pattern into a 1-based part index.
 *
 * @param capture Text of the pattern's capture group.
 * @param kind    How to read it: "digits" parses a number, "letter" maps
 *                A/a to 1, B/b to 2, ... Z/z to 26 (first character only).
 * @returns The index, or `null` when the capture does not yield a positive
 *   finite number (digits) or does not start with an ASCII letter (letter).
 */
function indexFromCapture(capture: string, kind: "digits" | "letter"): number | null {
  if (kind === "digits") {
    const n = Number(capture);
    return Number.isFinite(n) && n > 0 ? Math.trunc(n) : null;
  }
  // Letter: A=1, B=2, ...
  const code = capture.toUpperCase().charCodeAt(0);
  // Written as a positive range test so that NaN (charCodeAt on an empty
  // string) is rejected too; `code < 65 || code > 90` lets NaN through and
  // would make the function return NaN instead of null.
  if (!(code >= 65 && code <= 90)) return null;
  return code - 64;
}
/** Result of successfully matching one stem against one compiled pattern. */
interface PatternHit {
/** 1-based part index decoded from the pattern's capture group. */
partIndex: number;
/** Stem with the matched suffix removed (everything before the match start). */
variantGroup: string;
}
/**
 * Find the first pattern, in list order, that matches the end of `stem` and
 * whose capture decodes to a valid part index.
 *
 * @param stem     Filename stem to test.
 * @param patterns Compiled patterns, tried in order.
 * @returns The part index and the stem with the matched suffix cut off, or
 *   `null` when no pattern yields a usable index.
 */
function tryMatch(stem: string, patterns: CompiledPattern[]): PatternHit | null {
  for (const pattern of patterns) {
    const found = stem.match(pattern.re);
    if (found) {
      const partIndex = indexFromCapture(found[1] ?? "", pattern.kind);
      // A match whose capture is not a valid index (e.g. "0") does not count;
      // fall through and let later patterns try.
      if (partIndex != null) {
        const variantGroup = stem.slice(0, found.index!);
        return { partIndex, variantGroup };
      }
    }
  }
  return null;
}
/**
 * Classify a group of files that share one normalized JAV code.
 *
 * Algorithm:
 * 1. Zero files → empty result. Exactly one file → "single", without
 *    consulting the patterns.
 * 2. Try the patterns against every stem. Each stem that matches becomes a
 *    "part"; its variantGroup is the stem with the matched suffix removed.
 * 3. If nothing matched, every file becomes a "variant" of one shared group:
 *    the longest common prefix of all stems, or the file's own stem when
 *    there is no usable common prefix.
 * 4. Otherwise each unmatched stem is attached as a "variant" when it
 *    a. equals a part group or starts with `<group>.`
 *       (`XXX-001.fixed` next to `XXX-001-cd1`), or
 *    b. starts with `<matched part stem>.`
 *       (`XXX-001-cd1.fixed` next to `XXX-001-cd1`).
 *    In both cases the variant receives the part's variantGroup. Longer
 *    groups/stems are tried first so the most specific one wins. Anything
 *    still unattached is a stray and is marked "single".
 *
 * @param files    Files of one code group.
 * @param patterns Compiled part-suffix patterns, tried in order.
 * @returns One result per input file, in input order.
 */
export function classifyGroup(
  files: ClassifyInput[],
  patterns: CompiledPattern[],
): ClassifyResult[] {
  if (files.length === 0) return [];
  if (files.length === 1) {
    const only = files[0]!;
    return [{ key: only.key, partKind: "single", partIndex: null, variantGroup: null }];
  }

  // Pass 1: pattern match.
  const hits = new Map<string, PatternHit>();
  for (const f of files) {
    const hit = tryMatch(f.stem, patterns);
    if (hit) hits.set(f.key, hit);
  }

  if (hits.size === 0) {
    // No part-style suffixes detected anywhere → treat the whole group
    // as variants of one part.
    const group = longestCommonPrefix(files.map((f) => f.stem));
    return files.map((f) => ({
      key: f.key,
      partKind: "variant" as const,
      partIndex: null,
      variantGroup: group || f.stem,
    }));
  }

  // Pass 2 lookup tables, both sorted longest-first so the most specific
  // candidate binds first.
  // Distinct non-empty part groups (an empty group can never be a prefix
  // worth attaching to).
  const matchedGroups = Array.from(
    new Set(Array.from(hits.values()).map((h) => h.variantGroup)),
  )
    .filter((g) => g !== "")
    .sort((a, b) => b.length - a.length);

  // Full stems of the files that matched, paired with their group.
  const matchedParts: { stem: string; group: string }[] = [];
  for (const f of files) {
    const hit = hits.get(f.key);
    if (hit) matchedParts.push({ stem: f.stem, group: hit.variantGroup });
  }
  matchedParts.sort((a, b) => b.stem.length - a.stem.length);

  const out: ClassifyResult[] = [];
  for (const f of files) {
    const hit = hits.get(f.key);
    if (hit) {
      out.push({
        key: f.key,
        partKind: "part",
        partIndex: hit.partIndex,
        variantGroup: hit.variantGroup,
      });
      continue;
    }

    // Unmatched: first try the part groups (4a), then fall back to the full
    // stems of matched parts (4b). Without the fallback an alternate encode
    // of a single part, e.g. `XXX-001-cd1.fixed`, is never attached because
    // it does not start with `XXX-001.`.
    const attached =
      matchedGroups.find((g) => f.stem === g || f.stem.startsWith(g + ".")) ??
      matchedParts.find((p) => f.stem.startsWith(p.stem + "."))?.group;

    if (attached !== undefined) {
      out.push({ key: f.key, partKind: "variant", partIndex: null, variantGroup: attached });
    } else {
      // No way to attach — the file is a stray. Mark single.
      out.push({ key: f.key, partKind: "single", partIndex: null, variantGroup: null });
    }
  }
  return out;
}
/**
 * Longest prefix shared by all strings, cut back so it does not end in the
 * middle of a word or number.
 *
 * The raw character-wise prefix of `XXX-001-1080p` and `XXX-001-1440p` is
 * `XXX-001-1`, which splits the number `1080`/`1440`. When the prefix ends
 * in an ASCII letter (or digit) and at least one input continues with
 * another letter (or digit), the trailing run of that class is dropped.
 * A digit followed by a letter (`001` + `a`) counts as a boundary and is
 * kept. Trailing whitespace, `.`, `_` and `-` are trimmed afterwards.
 *
 * If backing off would leave nothing, the plain trimmed prefix is returned
 * instead, so inputs that share any prefix still get a non-empty group.
 *
 * @param strs Strings to compare.
 * @returns The shared prefix, or "" when there is none (or no input).
 */
function longestCommonPrefix(strs: string[]): string {
  if (strs.length === 0) return "";
  let prefix = strs[0]!;
  for (let i = 1; i < strs.length; i++) {
    const s = strs[i]!;
    let j = 0;
    while (j < prefix.length && j < s.length && prefix[j] === s[j]) j++;
    prefix = prefix.slice(0, j);
    if (!prefix) return "";
  }

  // Classify one UTF-16 unit; anything that is not an ASCII letter or digit
  // (including "past the end") is treated as a separator.
  const charClass = (ch: string | undefined): "letter" | "digit" | null => {
    if (ch === undefined) return null;
    if (/[A-Za-z]/.test(ch)) return "letter";
    if (/[0-9]/.test(ch)) return "digit";
    return null;
  };
  // Trim trailing punctuation so we don't end on something like "XXX-001.".
  const trimTail = (s: string): string => s.replace(/[\s._\-]+$/, "");

  const tailClass = charClass(prefix[prefix.length - 1]);
  const splitsToken =
    tailClass !== null && strs.some((s) => charClass(s[prefix.length]) === tailClass);

  let aligned = prefix;
  if (splitsToken) {
    let end = prefix.length;
    while (end > 0 && charClass(prefix[end - 1]) === tailClass) end--;
    aligned = prefix.slice(0, end);
  }

  return trimTail(aligned) || trimTail(prefix);
}
/**
 * Choose which file of a variant group should play by default.
 *
 * The first file whose stem equals `group` exactly wins. If there is none,
 * the file whose stem sorts first under `localeCompare` is chosen (the
 * earliest one in the input on ties).
 *
 * @param variants Files that all share the same variantGroup.
 * @param group    The group's name.
 * @returns The chosen file, or `null` for an empty list.
 */
export function pickDefaultVariant<T extends { stem: string }>(
  variants: T[],
  group: string,
): T | null {
  let earliest: T | null = null;
  for (const candidate of variants) {
    // An exact stem match short-circuits everything else.
    if (candidate.stem === group) return candidate;
    // Strict "<" keeps the first of equal stems, like a stable sort would.
    if (earliest === null || candidate.stem.localeCompare(earliest.stem) < 0) {
      earliest = candidate;
    }
  }
  return earliest;
}
/**
 * Derive a short display label for a variant from its stem and group.
 *
 * - stem equal to the group → `original`
 * - stem is `<group>.<rest>` → `<rest>` (`XXX-001.fixed` → `fixed`)
 * - stem otherwise starts with the group → the remainder with leading
 *   `.`, `_`, `-` and whitespace stripped (`XXX-001-4k` → `4k`)
 * - stem does not start with the group → the stem itself
 *
 * An empty remainder falls back to `original`.
 *
 * @param stem  Filename stem of the variant.
 * @param group Name of the variant group it belongs to.
 * @returns The label to show.
 */
export function variantLabel(stem: string, group: string): string {
  const fallback = "original";
  if (stem === group) return fallback;
  if (!stem.startsWith(group)) return stem;

  const remainder = stem.slice(group.length);
  // A single leading dot is the canonical separator and is removed alone;
  // any other leading run of separators is stripped wholesale.
  const label = remainder.startsWith(".")
    ? remainder.slice(1)
    : remainder.replace(/^[._\-\s]+/, "");
  return label || fallback;
}