196 lines
6.9 KiB
TypeScript
196 lines
6.9 KiB
TypeScript
import "server-only";
|
|
import path from "node:path";
|
|
import fs from "node:fs/promises";
|
|
import iconv from "iconv-lite";
|
|
|
|
/**
 * Subtitle file extensions this module recognises, lowercase and including
 * the leading dot.
 */
export const SUBTITLE_EXTS = [
  ".srt",
  ".vtt",
  ".ass",
  ".ssa",
] as const;

/** Union of the literal extension strings listed in `SUBTITLE_EXTS`. */
export type SubtitleExt = (typeof SUBTITLE_EXTS)[number];

/**
 * Set view of `SUBTITLE_EXTS`. Typed as `Set<string>` so that an arbitrary
 * extension string can be probed with `.has()` without a cast.
 */
const SUBTITLE_EXT_SET = new Set<string>(SUBTITLE_EXTS);
|
|
|
|
/** Three-letter language codes this module can resolve a subtitle to. */
export type LangIso = "eng" | "zho" | "jpn";

/** Language preference value; `"off"` is the variant with no language. */
export type LangPref = "EN" | "CN" | "JP" | "off";

/** A subtitle file discovered on disk. */
export interface SubtitleFileEntry {
  /**
   * Path of the file, formed by joining the directory being scanned with the
   * entry name (absolute when the scan root is absolute).
   */
  abs: string;
  /** Bare entry name, including its extension. */
  filename: string;
}
|
|
|
|
/**
 * Collect every subtitle file found under `root`.
 *
 * The traversal is iterative (explicit stack, so the most recently queued
 * directory is visited next). `root` itself is depth 0; a sub-directory is
 * only queued while the current depth is below `maxDepth`. Directories that
 * cannot be listed are skipped silently, which makes the scan best-effort.
 * Entries that are neither directories nor regular files are ignored.
 *
 * @param root Directory to start from.
 * @param maxDepth How many directory levels below `root` to descend into.
 * @returns One entry per file whose lowercased extension is a known subtitle
 *   extension.
 */
export async function walkSubtitles(root: string, maxDepth = 2): Promise<SubtitleFileEntry[]> {
  const found: SubtitleFileEntry[] = [];
  const pending: Array<{ dir: string; depth: number }> = [{ dir: root, depth: 0 }];

  let frame: { dir: string; depth: number } | undefined;
  while ((frame = pending.pop()) !== undefined) {
    const { dir, depth } = frame;

    let listing: import("node:fs").Dirent[];
    try {
      listing = await fs.readdir(dir, { withFileTypes: true });
    } catch {
      // Unreadable or vanished directory: skip it and keep scanning the rest.
      continue;
    }

    for (const entry of listing) {
      const entryPath = path.join(dir, entry.name);

      if (entry.isDirectory()) {
        if (depth >= maxDepth) continue;
        pending.push({ dir: entryPath, depth: depth + 1 });
        continue;
      }

      if (!entry.isFile()) continue;

      const extension = path.extname(entry.name).toLowerCase();
      if (SUBTITLE_EXT_SET.has(extension)) {
        found.push({ abs: entryPath, filename: entry.name });
      }
    }
  }

  return found;
}
|
|
|
|
/** Preference → language code, for every preference other than `"off"`. */
const PREF_TO_ISO: Record<Exclude<LangPref, "off">, LangIso> = {
  EN: "eng",
  CN: "zho",
  JP: "jpn",
};

/** Language code → preference; the inverse of `PREF_TO_ISO`. */
const ISO_TO_PREF: Record<LangIso, Exclude<LangPref, "off">> = {
  eng: "EN",
  zho: "CN",
  jpn: "JP",
};

/**
 * Translate a preference into its language code.
 *
 * @returns `null` for `"off"`, otherwise the mapped code.
 */
export function isoFromPref(pref: LangPref): LangIso | null {
  if (pref === "off") {
    return null;
  }
  return PREF_TO_ISO[pref];
}

/**
 * Translate a language code back into a preference.
 *
 * @returns `"off"` when `iso` is nullish, otherwise the mapped preference.
 */
export function prefFromIso(iso: LangIso | null): LangPref {
  if (iso == null) {
    return "off";
  }
  return ISO_TO_PREF[iso];
}
|
|
|
|
/** Lowercase spellings that identify English. */
const ENGLISH_TOKENS = new Set(["en", "eng", "english"]);

/** Lowercase spellings that identify Chinese (any script or region). */
const CHINESE_TOKENS = new Set([
  "zh", "zho", "chi", "chs", "cht", "chn", "cn", "chinese",
  "schinese", "tchinese", "simplified", "traditional",
  "zh-cn", "zh-tw", "zh-hans", "zh-hant",
]);

/** Lowercase spellings that identify Japanese. */
const JAPANESE_TOKENS = new Set(["ja", "jp", "jpn", "japanese", "jap"]);

/** Separators between the subtags of a compound tag such as `en-US` or `zh_TW`. */
const LANGUAGE_SUBTAG_SPLIT = /[-_]/;

/** Exact-match lookup of one already-lowercased token in the token tables. */
function lookupLanguageToken(token: string): LangIso | null {
  if (ENGLISH_TOKENS.has(token)) return "eng";
  if (CHINESE_TOKENS.has(token)) return "zho";
  if (JAPANESE_TOKENS.has(token)) return "jpn";
  return null;
}

/**
 * Map a free-form language tag to one of the supported language codes.
 *
 * The tag is trimmed and lowercased, then matched against the token tables.
 * If the whole tag is unknown but it is a compound tag (subtags separated by
 * `-` or `_`, e.g. `en-US`, `ja-JP`, `zh_TW`), the primary subtag decides the
 * language. A compound tag whose later subtags name a *different* supported
 * language (e.g. `chs-eng`) is treated as ambiguous and yields `null`.
 *
 * @param tag Language tag; may be missing.
 * @returns The resolved code, or `null` when the tag is empty, unknown, or
 *   ambiguous.
 */
export function normalizeLanguageTag(tag: string | null | undefined): LangIso | null {
  if (!tag) return null;
  const lower = tag.trim().toLowerCase();
  if (!lower) return null;

  const exact = lookupLanguageToken(lower);
  if (exact !== null) return exact;

  // Fallback for region/script-qualified tags that are not listed verbatim.
  const [primary, ...rest] = lower.split(LANGUAGE_SUBTAG_SPLIT).filter(Boolean);
  if (primary === undefined || rest.length === 0) return null;

  const base = lookupLanguageToken(primary);
  if (base === null) return null;

  const ambiguous = rest.some((subtag) => {
    const other = lookupLanguageToken(subtag);
    return other !== null && other !== base;
  });
  return ambiguous ? null : base;
}
|
|
|
|
/**
 * Human-readable English name for a language code.
 *
 * @returns `"English"`, `"Chinese"` or `"Japanese"`; `"Unknown"` for `null`
 *   or any other value.
 */
export function languageDisplay(iso: LangIso | null): string {
  switch (iso) {
    case "eng":
      return "English";
    case "zho":
      return "Chinese";
    case "jpn":
      return "Japanese";
    default:
      return "Unknown";
  }
}
|
|
|
|
/** Runs of whitespace and filename punctuation that separate tokens in a stem. */
const TOKEN_SPLIT = /[\s._\-\[\]()+,;]+/g;

/** Outcome of scanning a filename for language hints. */
export interface DetectedLanguage {
  /** The language code when exactly one language was found, otherwise `null`. */
  lang: LangIso | null;
  /** Display text: one language name, several joined with "/", or "Unknown". */
  label: string;
}

/**
 * Look for language hints in a filename.
 *
 * The extension is removed, the remaining stem is lowercased and split into
 * tokens, and every token is run through `normalizeLanguageTag`. When several
 * distinct languages are hinted, the label lists them all in a fixed order
 * (English, Chinese, Japanese) but `lang` is `null`, so callers matching on
 * `lang` only ever see an unambiguous single language.
 *
 * @param filename File name including its extension.
 */
export function detectLanguageFromName(filename: string): DetectedLanguage {
  const extLength = path.extname(filename).toLowerCase().length;
  const stem = extLength === 0 ? filename : filename.slice(0, -extLength);

  const hits = new Set<LangIso>();
  for (const piece of stem.toLowerCase().split(TOKEN_SPLIT)) {
    if (!piece) continue;
    const iso = normalizeLanguageTag(piece);
    if (iso !== null) hits.add(iso);
  }

  // Filtering the fixed order gives a stable label regardless of token order.
  const canonical: LangIso[] = ["eng", "zho", "jpn"];
  const present = canonical.filter((iso) => hits.has(iso));

  if (present.length === 0) {
    return { lang: null, label: "Unknown" };
  }

  const [only] = present;
  return {
    lang: present.length === 1 && only !== undefined ? only : null,
    label: present.map((iso) => languageDisplay(iso)).join("/"),
  };
}
|
|
|
|
/** An SRT timestamp `HH:MM:SS,mmm`; group 1 is the clock part, group 2 the milliseconds. */
const SRT_TIMESTAMP = /(\d{1,2}:\d{2}:\d{2}),(\d{3})/g;

/** A whole cue-timing line, recognised by the `-->` arrow. */
const SRT_TIMING_LINE = /^.*-->.*$/gm;

/**
 * Pure-JS SRT → WebVTT converter.
 *
 * Steps: drop a leading BOM, normalise CRLF / lone CR to LF, replace the
 * comma in `HH:MM:SS,mmm` with a dot, and prepend the `WEBVTT` header. No
 * styling is translated.
 *
 * The comma-to-dot rewrite is applied only on timing lines (those containing
 * `-->`), so cue text that happens to contain a timestamp-like string such as
 * "10:20:30,500" is passed through untouched.
 *
 * @param srt Decoded SRT file contents.
 * @returns WebVTT text starting with `WEBVTT` and a blank line.
 */
export function srtToVtt(srt: string): string {
  let body = srt;
  if (body.charCodeAt(0) === 0xfeff) body = body.slice(1);

  // Normalise line endings first so the line-anchored regex sees clean lines.
  body = body.replace(/\r\n?/g, "\n");

  body = body.replace(SRT_TIMING_LINE, (timingLine) =>
    timingLine.replace(SRT_TIMESTAMP, "$1.$2"),
  );

  return `WEBVTT\n\n${body.trimStart()}`;
}
|
|
|
|
/**
 * Return `filename` with its final extension removed.
 *
 * Only the last extension (as reported by `path.extname`) is stripped; any
 * directory part and earlier dotted segments are kept. Names without an
 * extension are returned unchanged.
 */
export function stemOf(filename: string): string {
  const extensionLength = path.extname(filename).length;
  return filename.slice(0, filename.length - extensionLength);
}
|
|
|
|
/** U+FFFD, counted in decoded text as the marker for an undecodable byte sequence. */
const REPLACEMENT_CHAR = "\uFFFD";

/** Hiragana (U+3040–U+309F) and katakana (U+30A0–U+30FF). */
const KANA_PATTERN = /[\u3040-\u30ff]/;

/**
 * Decode a subtitle file buffer to a JS string with best-effort encoding
 * detection. Many older Asian SRTs ship as cp936/GBK or Shift-JIS — feeding
 * them through `Buffer.toString("utf8")` produces mojibake. Strategy:
 *
 *   1. If a BOM is present (UTF-8 / UTF-16 LE / UTF-16 BE), strip it and
 *      decode with that encoding.
 *   2. Try strict UTF-8. If the buffer decodes without invalid sequences,
 *      use that result.
 *   3. Otherwise decode as utf8 / shift_jis / gb18030 / big5 and score each
 *      result by its number of replacement characters (lower is better).
 *   4. Bonuses: utf8 always gets one point off; shift_jis gets one point off
 *      when its decoded text contains hiragana or katakana. On equal scores
 *      the candidate listed earlier wins.
 *
 * @param buf Raw file contents.
 * @returns The decoded text, without any BOM that was detected in step 1.
 */
export function decodeSubtitleBuffer(buf: Buffer): string {
  // BOM detection — if present, the encoding is unambiguous.
  if (buf.length >= 3 && buf[0] === 0xef && buf[1] === 0xbb && buf[2] === 0xbf) {
    return buf.subarray(3).toString("utf8");
  }
  if (buf.length >= 2 && buf[0] === 0xff && buf[1] === 0xfe) {
    return iconv.decode(buf.subarray(2), "utf-16le");
  }
  if (buf.length >= 2 && buf[0] === 0xfe && buf[1] === 0xff) {
    return iconv.decode(buf.subarray(2), "utf-16be");
  }

  // UTF-8 strict — fast path for the common case.
  try {
    const decoder = new TextDecoder("utf-8", { fatal: true });
    return decoder.decode(buf);
  } catch { /* fall through to heuristic */ }

  // Compare candidate encodings by replacement-char count.
  const candidates = ["utf8", "shift_jis", "gb18030", "big5"] as const;
  let best: { text: string; score: number } | null = null;

  for (const encoding of candidates) {
    const text = iconv.decode(buf, encoding);

    let bad = 0;
    for (let i = 0; i < text.length; i++) {
      if (text[i] === REPLACEMENT_CHAR) bad++;
    }

    // shift_jis is favoured only when its own decoding actually yields kana:
    // gb18030 maps many JP byte sequences without errors but produces
    // gibberish that the replacement-char count alone would not catch.
    const kanaBonus = encoding === "shift_jis" && KANA_PATTERN.test(text);
    const score = kanaBonus || encoding === "utf8" ? bad - 1 : bad;

    // Strict `<` keeps the earlier candidate on ties.
    if (best === null || score < best.score) {
      best = { text, score };
    }
  }

  return best?.text ?? buf.toString("utf8");
}
|