// pinkudex/lib/video/subtitles.ts

import "server-only";
import path from "node:path";
import fs from "node:fs/promises";
import iconv from "iconv-lite";

/** File extensions (lowercase, with leading dot) that are treated as subtitle files. */
export const SUBTITLE_EXTS = [".srt", ".vtt", ".ass", ".ssa"] as const;
/** Union of the literal extension strings listed in `SUBTITLE_EXTS`. */
export type SubtitleExt = (typeof SUBTITLE_EXTS)[number];

// Set view of SUBTITLE_EXTS, typed as Set<string> so arbitrary strings can be
// tested for membership without a cast.
const SUBTITLE_EXT_SET = new Set<string>(SUBTITLE_EXTS);

/** Three-letter language codes this module can resolve to. */
export type LangIso = "eng" | "zho" | "jpn";
/** Subtitle language preference; "off" corresponds to no language (see `isoFromPref`). */
export type LangPref = "EN" | "CN" | "JP" | "off";

/** One subtitle file found by `walkSubtitles`. */
export interface SubtitleFileEntry {
  /** Containing directory joined with the entry name (absolute only if the walk root was). */
  abs: string;
  /** Entry name as reported by the directory listing, extension included. */
  filename: string;
}

/**
 * Collect subtitle files beneath `root`.
 *
 * A directory is listed when it sits at most `maxDepth` levels below `root`
 * (`root` itself is level 0). Entries reported as regular files are kept when
 * their lower-cased extension is in `SUBTITLE_EXT_SET`. Directories that
 * cannot be listed are skipped silently, so the walk is best-effort.
 *
 * @param root     Directory to start from.
 * @param maxDepth Deepest directory level that is still listed (default 2).
 * @returns Entries in discovery order: a directory's own files first, then
 *          its sub-directories, last-listed sub-directory first.
 */
export async function walkSubtitles(root: string, maxDepth = 2): Promise<SubtitleFileEntry[]> {
  const found: SubtitleFileEntry[] = [];

  const visit = async (folder: string, level: number): Promise<void> => {
    let listing: import("node:fs").Dirent[];
    try {
      listing = await fs.readdir(folder, { withFileTypes: true });
    } catch {
      // Unreadable or vanished directory: ignore it and carry on.
      return;
    }

    const subfolders: string[] = [];
    for (const item of listing) {
      const itemPath = path.join(folder, item.name);
      if (item.isDirectory()) {
        if (level < maxDepth) subfolders.push(itemPath);
        continue;
      }
      if (!item.isFile()) continue;
      const extension = path.extname(item.name).toLowerCase();
      if (SUBTITLE_EXT_SET.has(extension)) {
        found.push({ abs: itemPath, filename: item.name });
      }
    }

    // Descend last-listed first, depth-first, one directory at a time.
    for (const subfolder of subfolders.reverse()) {
      await visit(subfolder, level + 1);
    }
  };

  await visit(root, 0);
  return found;
}

/** Language code for every preference value other than "off". */
const PREF_TO_ISO: Record<Exclude<LangPref, "off">, LangIso> = {
  EN: "eng",
  CN: "zho",
  JP: "jpn",
};

/** Inverse of `PREF_TO_ISO`: preference value for every language code. */
const ISO_TO_PREF: Record<LangIso, Exclude<LangPref, "off">> = {
  eng: "EN",
  zho: "CN",
  jpn: "JP",
};

/**
 * Translate a language preference into its language code.
 *
 * @returns `null` for "off", otherwise the matching `PREF_TO_ISO` entry.
 */
export function isoFromPref(pref: LangPref): LangIso | null {
  if (pref === "off") {
    return null;
  }
  return PREF_TO_ISO[pref];
}

/**
 * Translate a language code back into a language preference.
 *
 * @returns "off" when no code is given, otherwise the matching `ISO_TO_PREF` entry.
 */
export function prefFromIso(iso: LangIso | null): LangPref {
  if (iso == null) {
    return "off";
  }
  return ISO_TO_PREF[iso];
}

/** Lower-case spellings recognised as English. */
const ENGLISH_TOKENS = new Set(["en", "eng", "english"]);
/** Lower-case spellings recognised as Chinese (simplified or traditional). */
const CHINESE_TOKENS = new Set([
  "zh", "zho", "chi", "chs", "cht", "chn", "cn", "chinese",
  "schinese", "tchinese", "simplified", "traditional",
  "zh-cn", "zh-tw", "zh-hans", "zh-hant",
]);
/** Lower-case spellings recognised as Japanese. */
const JAPANESE_TOKENS = new Set(["ja", "jp", "jpn", "japanese", "jap"]);

/** A BCP 47 region subtag (2 letters or 3 digits) or script subtag (4 letters), lower-cased. */
const SCRIPT_OR_REGION_SUBTAG = /^(?:[a-z]{2}|[a-z]{4}|\d{3})$/;

/** Exact lookup of an already trimmed, lower-cased token in the three token sets. */
function lookupLanguageToken(token: string): LangIso | null {
  if (ENGLISH_TOKENS.has(token)) return "eng";
  if (CHINESE_TOKENS.has(token)) return "zho";
  if (JAPANESE_TOKENS.has(token)) return "jpn";
  return null;
}

/**
 * Map a free-form language tag onto one of the supported language codes.
 *
 * Matching is case-insensitive and ignores surrounding whitespace. `_` is
 * treated like `-`, so `zh_CN` and `zh-CN` are equivalent. When the whole tag
 * is not a known token, a tag shaped like `language-script-region`
 * (e.g. `en-US`, `ja-JP`, `zh-Hant-HK`) is resolved through its primary
 * subtag. Tags whose trailing parts are not script/region subtags
 * (e.g. `chs-eng`) are deliberately left unresolved.
 *
 * @param tag Raw tag; `null`, `undefined` and blank strings are accepted.
 * @returns The language code, or `null` when the tag is not recognised.
 */
export function normalizeLanguageTag(tag: string | null | undefined): LangIso | null {
  if (!tag) return null;
  const lower = tag.trim().toLowerCase().replace(/_/g, "-");
  if (!lower) return null;

  const exact = lookupLanguageToken(lower);
  if (exact) return exact;

  // Region/script-qualified tag: fall back to the primary language subtag.
  const [primary, ...rest] = lower.split("-");
  if (!primary || rest.length === 0) return null;
  if (!rest.every((subtag) => SCRIPT_OR_REGION_SUBTAG.test(subtag))) return null;
  return lookupLanguageToken(primary);
}

/**
 * Human-readable English name for a language code.
 *
 * @returns "English", "Chinese" or "Japanese"; "Unknown" for `null` or any
 *          other value.
 */
export function languageDisplay(iso: LangIso | null): string {
  switch (iso) {
    case "eng":
      return "English";
    case "zho":
      return "Chinese";
    case "jpn":
      return "Japanese";
    default:
      return "Unknown";
  }
}

// Separators between filename tokens: whitespace, dot, underscore, hyphen,
// square/round brackets, plus, comma, semicolon and ampersand. The ampersand
// is included so dual-language names such as "chs&eng" yield two tokens.
const TOKEN_SPLIT = /[\s._\-\[\]()+,;&]+/g;

/** Outcome of looking for language hints in a subtitle filename. */
export interface DetectedLanguage {
  /** Single ISO code if exactly one language was detected. */
  lang: LangIso | null;
  /** Display label — "English", "Chinese", "English/Chinese", "Unknown". */
  label: string;
}

/**
 * Look for language hints in a subtitle filename.
 *
 * The extension is removed, the remaining stem is lower-cased and split on
 * `TOKEN_SPLIT`, and every token is run through `normalizeLanguageTag`.
 *
 * - No hit: `lang` is null and the label is "Unknown".
 * - One language: `lang` is that code and the label is its display name.
 * - Several languages: the label joins the display names with "/" in the
 *   fixed order English, Chinese, Japanese, while `lang` stays null so that
 *   sticky-preference matching only ever resolves to a single language.
 */
export function detectLanguageFromName(filename: string): DetectedLanguage {
  const loweredExt = path.extname(filename).toLowerCase();
  const stem = loweredExt ? filename.slice(0, -loweredExt.length) : filename;

  const hits = new Set<LangIso>();
  stem
    .toLowerCase()
    .split(TOKEN_SPLIT)
    .filter(Boolean)
    .forEach((token) => {
      const iso = normalizeLanguageTag(token);
      if (iso) hits.add(iso);
    });

  if (hits.size === 0) {
    return { lang: null, label: "Unknown" };
  }

  // Fixed presentation order, independent of where the tokens appeared.
  const canonicalOrder: LangIso[] = ["eng", "zho", "jpn"];
  const present = canonicalOrder.filter((iso) => hits.has(iso));

  if (present.length === 1) {
    const only = present[0]!;
    return { lang: only, label: languageDisplay(only) };
  }
  return { lang: null, label: present.map(languageDisplay).join("/") };
}

/** An SRT timestamp: `H:MM:SS` or `HH:MM:SS` followed by `,mmm`. */
const SRT_TIMESTAMP = /(\d{1,2}:\d{2}:\d{2}),(\d{3})/g;

/** A whole line that contains the cue-timing arrow `-->`. */
const SRT_TIMING_LINE = /^.*-->.*$/gm;

/**
 * Pure JS SRT → WebVTT converter. Strips a leading BOM, normalizes CRLF and
 * bare CR to LF, swaps the comma in HH:MM:SS,mmm timestamps for a dot, and
 * prepends the WEBVTT header. No styling translation. Cheap; runs on every
 * sidecar miss without spawning ffmpeg.
 *
 * Only lines containing `-->` are rewritten, so cue text that happens to
 * contain a timestamp-like string (e.g. "12:30:00,000") is left untouched.
 *
 * @param srt Decoded SRT file contents.
 * @returns WebVTT text starting with "WEBVTT" followed by a blank line.
 */
export function srtToVtt(srt: string): string {
  let body = srt;
  if (body.charCodeAt(0) === 0xfeff) body = body.slice(1);
  body = body.replace(/\r\n?/g, "\n");
  body = body.replace(SRT_TIMING_LINE, (line) => line.replace(SRT_TIMESTAMP, "$1.$2"));
  return `WEBVTT\n\n${body.trimStart()}`;
}

/**
 * Return `filename` without its final extension, as determined by
 * `path.extname`. Names with no extension are returned unchanged.
 */
export function stemOf(filename: string): string {
  const extLength = path.extname(filename).length;
  if (extLength === 0) {
    return filename;
  }
  return filename.slice(0, filename.length - extLength);
}

/** U+FFFD, the character counted as a decoding failure in candidate output. */
const REPLACEMENT_CHAR = "\uFFFD";

/** Hiragana and katakana blocks (U+3040–U+30FF). */
const KANA_PATTERN = /[\u3040-\u30ff]/;

type CandidateEncoding = "utf8" | "shift_jis" | "gb18030" | "big5";

/** Encodings tried, in evaluation order, once strict UTF-8 decoding has failed. */
const CANDIDATE_ENCODINGS: readonly CandidateEncoding[] = [
  "utf8", "shift_jis", "gb18030", "big5",
];

/** Number of U+FFFD characters in `text`. */
function countReplacementChars(text: string): number {
  let count = 0;
  for (let i = 0; i < text.length; i++) {
    if (text[i] === REPLACEMENT_CHAR) count++;
  }
  return count;
}

/**
 * Preference among candidates whose scores are equal (lower wins):
 * utf8, shift_jis with kana, gb18030, shift_jis without kana, big5.
 */
function tieBreakRank(encoding: CandidateEncoding, kanaShiftJis: boolean): number {
  if (encoding === "utf8") return 0;
  if (encoding === "shift_jis") return kanaShiftJis ? 1 : 3;
  if (encoding === "gb18030") return 2;
  return 4;
}

/**
 * Decode a subtitle file buffer to a JS string with best-effort
 * encoding detection. Many older Asian SRTs ship as cp936/GBK or
 * Shift-JIS — feeding them through `Buffer.toString("utf8")` produces
 * mojibake. Strategy:
 *   1. If a BOM is present (UTF-8 / UTF-16 LE / UTF-16 BE), strip it and
 *      decode with that encoding.
 *   2. Try UTF-8 strict. If it decodes without invalid sequences, use it.
 *   3. Otherwise decode as utf8 / shift_jis / gb18030 / big5 and score each
 *      result by its number of U+FFFD replacement characters. utf8 always
 *      gets a one-point bonus; shift_jis gets one when its decoded text
 *      contains hiragana/katakana. The lowest score wins.
 *   4. Equal scores are resolved by `tieBreakRank`: shift_jis is preferred
 *      over gb18030 only when kana are present, gb18030 otherwise — common
 *      heuristic for JP vs CN fansub source material.
 *
 * @param buf Raw file bytes.
 * @returns The decoded text; undecodable bytes may remain as U+FFFD.
 */
export function decodeSubtitleBuffer(buf: Buffer): string {
  // BOM detection — if present, the encoding is unambiguous.
  if (buf.length >= 3 && buf[0] === 0xef && buf[1] === 0xbb && buf[2] === 0xbf) {
    return buf.subarray(3).toString("utf8");
  }
  if (buf.length >= 2 && buf[0] === 0xff && buf[1] === 0xfe) {
    return iconv.decode(buf.subarray(2), "utf-16le");
  }
  if (buf.length >= 2 && buf[0] === 0xfe && buf[1] === 0xff) {
    return iconv.decode(buf.subarray(2), "utf-16be");
  }

  // UTF-8 strict — fast path for the common case.
  try {
    const decoder = new TextDecoder("utf-8", { fatal: true });
    return decoder.decode(buf);
  } catch { /* fall through to heuristic */ }

  // Compare candidate encodings by replacement-char count.
  let best: { text: string; score: number; rank: number } | null = null;
  for (const encoding of CANDIDATE_ENCODINGS) {
    const text = iconv.decode(buf, encoding);
    const bad = countReplacementChars(text);
    // gb18030 maps many Shift-JIS byte pairs without errors, so a clean
    // replacement count alone cannot separate the two; kana in the
    // Shift-JIS reading is the extra signal.
    const kanaShiftJis = encoding === "shift_jis" && KANA_PATTERN.test(text);
    const score = kanaShiftJis || encoding === "utf8" ? bad - 1 : bad;
    const rank = tieBreakRank(encoding, kanaShiftJis);
    // An explicit rank (rather than list order) decides ties, so kana-free
    // text that decodes equally well both ways is read as gb18030.
    if (
      best == null ||
      score < best.score ||
      (score === best.score && rank < best.rank)
    ) {
      best = { text, score, rank };
    }
  }
  return best?.text ?? buf.toString("utf8");
}