Initial commit

2026-05-26 22:46:00 +02:00
commit 7e2c2ff89c
256 changed files with 51523 additions and 0 deletions
@@ -0,0 +1,195 @@
+import "server-only";
+import path from "node:path";
+import fs from "node:fs/promises";
+import iconv from "iconv-lite";
+
+/** Subtitle file extensions recognised by this module: lowercase, with the leading dot. */
+export const SUBTITLE_EXTS = [".srt", ".vtt", ".ass", ".ssa"] as const;
+/** Union of the literal extension strings listed in `SUBTITLE_EXTS`. */
+export type SubtitleExt = (typeof SUBTITLE_EXTS)[number];
+
+// Set form of SUBTITLE_EXTS for membership tests. Typed as Set<string> so
+// that arbitrary extension strings can be passed to `.has()`.
+const SUBTITLE_EXT_SET = new Set<string>(SUBTITLE_EXTS);
+
+/** Three-letter language codes this module can resolve to. */
+export type LangIso = "eng" | "zho" | "jpn";
+/** Language preference values; "off" is mapped to no language by `isoFromPref`. */
+export type LangPref = "EN" | "CN" | "JP" | "off";
+
+/** One subtitle file found by `walkSubtitles`. */
+export interface SubtitleFileEntry {
+  /** Walked directory joined with the entry name (absolute only if the walk root was). */
+  abs: string;
+  /** Directory entry name, including its extension. */
+  filename: string;
+}
+
+/**
+ * Collect subtitle files beneath `root`, descending at most `maxDepth`
+ * directory levels below it (`root` itself is level 0).
+ *
+ * Directories that cannot be read are skipped silently, so a missing or
+ * unreadable `root` yields an empty list. An entry is returned only when
+ * `isFile()` is true for it and its lower-cased extension is in
+ * `SUBTITLE_EXT_SET`.
+ *
+ * @param root Directory to start from.
+ * @param maxDepth How many directory levels below `root` may be entered.
+ * @returns One entry per matching file, in traversal order.
+ */
+export async function walkSubtitles(root: string, maxDepth = 2): Promise<SubtitleFileEntry[]> {
+  const matches: SubtitleFileEntry[] = [];
+  // Explicit LIFO work list instead of recursion.
+  const pending: Array<{ dir: string; depth: number }> = [{ dir: root, depth: 0 }];
+
+  for (let current = pending.pop(); current !== undefined; current = pending.pop()) {
+    let listing: import("node:fs").Dirent[];
+    try {
+      listing = await fs.readdir(current.dir, { withFileTypes: true });
+    } catch {
+      // Best effort: an unreadable directory contributes nothing.
+      continue;
+    }
+
+    for (const entry of listing) {
+      const fullPath = path.join(current.dir, entry.name);
+
+      if (entry.isDirectory()) {
+        // Queue the child only while the depth budget allows it.
+        if (current.depth < maxDepth) {
+          pending.push({ dir: fullPath, depth: current.depth + 1 });
+        }
+        continue;
+      }
+      if (!entry.isFile()) continue;
+
+      const extension = path.extname(entry.name).toLowerCase();
+      if (SUBTITLE_EXT_SET.has(extension)) {
+        matches.push({ abs: fullPath, filename: entry.name });
+      }
+    }
+  }
+
+  return matches;
+}
+
+// Preference value → language code. "off" is excluded from the key type,
+// so callers have to handle it before indexing.
+const PREF_TO_ISO: Record<Exclude<LangPref, "off">, LangIso> = {
+  EN: "eng",
+  CN: "zho",
+  JP: "jpn",
+};
+
+// Inverse of PREF_TO_ISO: language code → preference value.
+const ISO_TO_PREF: Record<LangIso, Exclude<LangPref, "off">> = {
+  eng: "EN",
+  zho: "CN",
+  jpn: "JP",
+};
+
+/**
+ * Translate a language preference into its ISO code.
+ *
+ * @param pref Preference value; "off" means no language is selected.
+ * @returns The matching code from `PREF_TO_ISO`, or null for "off".
+ */
+export function isoFromPref(pref: LangPref): LangIso | null {
+  if (pref === "off") {
+    return null;
+  }
+  return PREF_TO_ISO[pref];
+}
+
+/**
+ * Translate an ISO code back into a language preference.
+ *
+ * @param iso Language code, or null when no language applies.
+ * @returns The matching value from `ISO_TO_PREF`, or "off" for null/undefined.
+ */
+export function prefFromIso(iso: LangIso | null): LangPref {
+  // Loose comparison on purpose: it also catches undefined.
+  if (iso == null) {
+    return "off";
+  }
+  return ISO_TO_PREF[iso];
+}
+
+// Lower-case aliases accepted for each supported language.
+const ENGLISH_TOKENS = new Set(["en", "eng", "english"]);
+const CHINESE_TOKENS = new Set([
+  "zh", "zho", "chi", "chs", "cht", "chn", "cn", "chinese",
+  "schinese", "tchinese", "simplified", "traditional",
+  "zh-cn", "zh-tw", "zh-hans", "zh-hant",
+]);
+const JAPANESE_TOKENS = new Set(["ja", "jp", "jpn", "japanese", "jap"]);
+
+/** Exact-match lookup of an already lower-cased token in the alias sets. */
+function lookupLanguageToken(token: string): LangIso | null {
+  if (ENGLISH_TOKENS.has(token)) return "eng";
+  if (CHINESE_TOKENS.has(token)) return "zho";
+  if (JAPANESE_TOKENS.has(token)) return "jpn";
+  return null;
+}
+
+/**
+ * Map a free-form language tag to one of the supported ISO codes.
+ *
+ * The tag is trimmed and lower-cased, then matched against the alias sets.
+ * When the whole tag is not a known alias, the part before the first `-`
+ * or `_` is tried as well, so region- or script-qualified tags such as
+ * "en-US", "ja_JP" or "zh-SG" resolve to their base language.
+ *
+ * @param tag Language tag or name; may be null, undefined or blank.
+ * @returns "eng", "zho" or "jpn", or null when the tag is empty or unrecognised.
+ */
+export function normalizeLanguageTag(tag: string | null | undefined): LangIso | null {
+  if (!tag) return null;
+  const lower = tag.trim().toLowerCase();
+  if (!lower) return null;
+
+  const exact = lookupLanguageToken(lower);
+  if (exact !== null) return exact;
+
+  // Fall back to the primary subtag; a leading separator leaves nothing to look up.
+  const separatorAt = lower.search(/[-_]/);
+  return separatorAt > 0 ? lookupLanguageToken(lower.slice(0, separatorAt)) : null;
+}
+
+/**
+ * Human-readable name for a language code.
+ *
+ * @param iso Language code, or null when none is known.
+ * @returns "English", "Chinese" or "Japanese"; "Unknown" for anything else.
+ */
+export function languageDisplay(iso: LangIso | null): string {
+  switch (iso) {
+    case "eng":
+      return "English";
+    case "zho":
+      return "Chinese";
+    case "jpn":
+      return "Japanese";
+    default:
+      return "Unknown";
+  }
+}
+
+// Separator run used to tokenise filename stems: whitespace, dot, underscore,
+// hyphen, square brackets, parentheses, plus, comma and semicolon.
+const TOKEN_SPLIT = /[\s._\-\[\]()+,;]+/g;
+
+/** Result of `detectLanguageFromName`. */
+export interface DetectedLanguage {
+  /** Single ISO code if exactly one language was detected; null when none or several were. */
+  lang: LangIso | null;
+  /** Display label — e.g. "English", "Chinese", "English/Chinese", "Unknown". */
+  label: string;
+}
+
+/**
+ * Look for language hints embedded in a filename.
+ *
+ * The extension is dropped, the remaining stem is lower-cased and split on
+ * `TOKEN_SPLIT`, and every token is run through `normalizeLanguageTag`.
+ *
+ * - No hit: `lang` is null and the label is "Unknown".
+ * - One language: `lang` is that code and the label is its display name.
+ * - Several languages: the label joins the display names with "/" in the
+ *   fixed order English, Chinese, Japanese, while `lang` stays null so that
+ *   preference matching only ever resolves to a single language.
+ *
+ * @param filename File name (with or without extension) to inspect.
+ * @returns The detected language code, if unambiguous, plus a display label.
+ */
+export function detectLanguageFromName(filename: string): DetectedLanguage {
+  const extLength = path.extname(filename).toLowerCase().length;
+  const stem = extLength > 0 ? filename.slice(0, -extLength) : filename;
+
+  const hits = new Set<LangIso>();
+  stem
+    .toLowerCase()
+    .split(TOKEN_SPLIT)
+    .forEach((token) => {
+      if (!token) return;
+      const iso = normalizeLanguageTag(token);
+      if (iso !== null) hits.add(iso);
+    });
+
+  // Walk the canonical order once; it doubles as the label order.
+  const canonicalOrder: LangIso[] = ["eng", "zho", "jpn"];
+  const present = canonicalOrder.filter((code) => hits.has(code));
+  const [first, ...others] = present;
+
+  if (first === undefined) {
+    return { lang: null, label: "Unknown" };
+  }
+  if (others.length === 0) {
+    return { lang: first, label: languageDisplay(first) };
+  }
+  return { lang: null, label: present.map(languageDisplay).join("/") };
+}
+
+// HH:MM:SS,mmm as written in SRT; group 1 is the clock part, group 2 the milliseconds.
+const SRT_TIMESTAMP = /(\d{1,2}:\d{2}:\d{2}),(\d{3})/g;
+// A whole line containing the cue arrow, i.e. a cue timing line.
+const SRT_TIMING_LINE = /^.*-->.*$/gm;
+
+/**
+ * Pure JS SRT → WebVTT converter. No styling translation and no
+ * external process, so it is cheap enough to run on demand.
+ *
+ * Steps:
+ *   1. Strip a leading BOM.
+ *   2. Normalise CRLF and lone CR line endings to LF.
+ *   3. On cue timing lines only (lines containing `-->`), swap the comma
+ *      in HH:MM:SS,mmm for a dot. Dialogue text is left untouched, so a
+ *      subtitle that happens to quote a timestamp is not rewritten.
+ *   4. Prepend the `WEBVTT` header and drop leading whitespace from the body.
+ *
+ * @param srt Decoded SRT file contents.
+ * @returns The equivalent WebVTT document; just the header for empty input.
+ */
+export function srtToVtt(srt: string): string {
+  const withoutBom = srt.charCodeAt(0) === 0xfeff ? srt.slice(1) : srt;
+  const unixLines = withoutBom.replace(/\r\n?/g, "\n");
+  const body = unixLines.replace(SRT_TIMING_LINE, (timingLine) =>
+    timingLine.replace(SRT_TIMESTAMP, "$1.$2"),
+  );
+  return `WEBVTT\n\n${body.trimStart()}`;
+}
+
+/**
+ * Return `filename` without its final extension.
+ *
+ * The extension is whatever `path.extname` reports, so names without one
+ * (including dotfiles such as ".bashrc") come back unchanged.
+ *
+ * @param filename File name or path.
+ * @returns The input minus its trailing extension, if it has one.
+ */
+export function stemOf(filename: string): string {
+  const extLength = path.extname(filename).length;
+  // With no extension this keeps the whole string, so no special case is needed.
+  return filename.substring(0, filename.length - extLength);
+}
+
+// U+FFFD, counted below as the marker for an undecodable byte sequence.
+// Written as an escape so the constant survives a re-encoding of this file.
+const REPLACEMENT_CHAR = "\uFFFD";
+
+// Hiragana and katakana blocks (U+3040–U+30FF). Half-width katakana
+// (U+FF61–U+FF9F) are deliberately not included: GBK bytes in the
+// 0xA1–0xDF range decode to them under Shift-JIS.
+const KANA_RANGE = /[\u3040-\u30ff]/;
+
+type FallbackEncoding = "utf8" | "shift_jis" | "gb18030" | "big5";
+
+const FALLBACK_ENCODINGS: readonly FallbackEncoding[] = [
+  "utf8", "shift_jis", "gb18030", "big5",
+];
+
+// Tie-break order between candidates with the same score (lower wins).
+// Shift-JIS sits at rank 1 only when its output contains kana; without
+// kana it drops to KANA_LESS_SHIFT_JIS_RANK, i.e. behind gb18030.
+const TIE_BREAK_RANK: Record<FallbackEncoding, number> = {
+  utf8: 0,
+  shift_jis: 1,
+  gb18030: 2,
+  big5: 4,
+};
+const KANA_LESS_SHIFT_JIS_RANK = 3;
+
+/** Count occurrences of the replacement character in `text`. */
+function countReplacementChars(text: string): number {
+  let count = 0;
+  for (let i = 0; i < text.length; i++) {
+    if (text[i] === REPLACEMENT_CHAR) count++;
+  }
+  return count;
+}
+
+/**
+ * Decode a subtitle file buffer to a JS string with best-effort
+ * encoding detection. Many older Asian SRTs ship as cp936/GBK or
+ * Shift-JIS — feeding them through `Buffer.toString("utf8")` produces
+ * mojibake. Strategy:
+ *   1. If a BOM is present (UTF-8 / UTF-16 LE / UTF-16 BE), strip it and
+ *      decode with that encoding.
+ *   2. Try UTF-8 strict. If it decodes without invalid sequences, use it.
+ *   3. Otherwise decode as utf8 / shift_jis / gb18030 / big5 and score
+ *      each result by its number of replacement characters. utf8 gets a
+ *      one-character allowance, and so does shift_jis when its output
+ *      contains hiragana or katakana. The lowest score wins.
+ *   4. Equal scores are resolved in the order utf8, shift_jis (with kana),
+ *      gb18030, shift_jis (without kana), big5. Kana-less Shift-JIS must
+ *      lose to gb18030: GBK text can decode "cleanly" as Shift-JIS
+ *      half-width katakana, which would otherwise be returned as gibberish.
+ *
+ * @param buf Raw bytes of the subtitle file.
+ * @returns The decoded text, without any BOM.
+ */
+export function decodeSubtitleBuffer(buf: Buffer): string {
+  // BOM detection — if present, the encoding is unambiguous.
+  if (buf.length >= 3 && buf[0] === 0xef && buf[1] === 0xbb && buf[2] === 0xbf) {
+    return buf.subarray(3).toString("utf8");
+  }
+  if (buf.length >= 2 && buf[0] === 0xff && buf[1] === 0xfe) {
+    return iconv.decode(buf.subarray(2), "utf-16le");
+  }
+  if (buf.length >= 2 && buf[0] === 0xfe && buf[1] === 0xff) {
+    return iconv.decode(buf.subarray(2), "utf-16be");
+  }
+
+  // UTF-8 strict — fast path for the common case.
+  try {
+    return new TextDecoder("utf-8", { fatal: true }).decode(buf);
+  } catch { /* fall through to heuristic */ }
+
+  // Compare candidate encodings by replacement-char count, then by rank.
+  let best: { text: string; score: number; rank: number } | null = null;
+  for (const encoding of FALLBACK_ENCODINGS) {
+    const text = iconv.decode(buf, encoding);
+    const bad = countReplacementChars(text);
+
+    const isShiftJis = encoding === "shift_jis";
+    const shiftJisWithKana = isShiftJis && KANA_RANGE.test(text);
+    const score = encoding === "utf8" || shiftJisWithKana ? bad - 1 : bad;
+    const rank = isShiftJis && !shiftJisWithKana
+      ? KANA_LESS_SHIFT_JIS_RANK
+      : TIE_BREAK_RANK[encoding];
+
+    if (
+      best === null ||
+      score < best.score ||
+      (score === best.score && rank < best.rank)
+    ) {
+      best = { text, score, rank };
+    }
+  }
+  return best?.text ?? buf.toString("utf8");
+}