Initial commit

This commit is contained in:
admin
2026-05-26 22:46:00 +02:00
commit 7e2c2ff89c
256 changed files with 51523 additions and 0 deletions
+195
View File
@@ -0,0 +1,195 @@
import "server-only";
import path from "node:path";
import fs from "node:fs/promises";
import iconv from "iconv-lite";
/** File extensions (lowercase, leading dot included) that count as subtitle files. */
export const SUBTITLE_EXTS = [".srt", ".vtt", ".ass", ".ssa"] as const;
/** Union of the literal extension strings listed in `SUBTITLE_EXTS`. */
export type SubtitleExt = (typeof SUBTITLE_EXTS)[number];
// Set view of SUBTITLE_EXTS for membership tests. Typed Set<string> so an
// arbitrary extension string can be passed to `.has()` without a cast.
const SUBTITLE_EXT_SET = new Set<string>(SUBTITLE_EXTS);
/** Three-letter language codes this module can resolve to. */
export type LangIso = "eng" | "zho" | "jpn";
/** Language preference values; "off" corresponds to no language at all. */
export type LangPref = "EN" | "CN" | "JP" | "off";
/** One subtitle file found by `walkSubtitles`. */
export interface SubtitleFileEntry {
/** Path of the file: the scanned directory joined with the entry name
 * (so it is absolute only when the walk root was). */
abs: string;
/** The directory entry's name, extension included. */
filename: string;
}
/**
 * Collect every subtitle file found under `root`.
 *
 * Directories are visited from an explicit LIFO work list rather than by
 * recursion. `root` counts as depth 0 and a sub-directory is only queued
 * while its parent's depth is below `maxDepth`. A directory that cannot be
 * read is skipped silently, so the walk is best-effort.
 *
 * @param root Directory to start from.
 * @param maxDepth How many directory levels below `root` may be entered.
 * @returns One entry per regular file whose extension (compared
 *   case-insensitively) is in `SUBTITLE_EXT_SET`.
 */
export async function walkSubtitles(root: string, maxDepth = 2): Promise<SubtitleFileEntry[]> {
  const found: SubtitleFileEntry[] = [];
  const pending: Array<{ dir: string; depth: number }> = [{ dir: root, depth: 0 }];

  for (let frame = pending.pop(); frame !== undefined; frame = pending.pop()) {
    let listing: import("node:fs").Dirent[];
    try {
      listing = await fs.readdir(frame.dir, { withFileTypes: true });
    } catch {
      // Unreadable or vanished directory: move on to the next queued one.
      continue;
    }

    for (const entry of listing) {
      const location = path.join(frame.dir, entry.name);

      if (entry.isDirectory()) {
        if (frame.depth < maxDepth) {
          pending.push({ dir: location, depth: frame.depth + 1 });
        }
        continue;
      }
      if (!entry.isFile()) continue;

      const suffix = path.extname(entry.name).toLowerCase();
      if (SUBTITLE_EXT_SET.has(suffix)) {
        found.push({ abs: location, filename: entry.name });
      }
    }
  }

  return found;
}
// Preference -> language code. Declared as a Record over every preference
// except "off", so a preference added to LangPref without a mapping here
// fails to compile.
const PREF_TO_ISO: Record<Exclude<LangPref, "off">, LangIso> = {
EN: "eng",
CN: "zho",
JP: "jpn",
};
// Inverse of PREF_TO_ISO: language code -> preference. Keyed by every
// LangIso member, so the two tables must be extended together.
const ISO_TO_PREF: Record<LangIso, Exclude<LangPref, "off">> = {
eng: "EN",
zho: "CN",
jpn: "JP",
};
/**
 * Translate a language preference into its language code.
 *
 * @param pref The stored preference.
 * @returns The matching `LangIso`, or `null` when the preference is "off".
 */
export function isoFromPref(pref: LangPref): LangIso | null {
  if (pref === "off") {
    return null;
  }
  return PREF_TO_ISO[pref];
}
/**
 * Translate a language code back into a language preference.
 *
 * @param iso A language code, or `null`/`undefined` for "no language".
 * @returns The matching preference; "off" when no code was given.
 */
export function prefFromIso(iso: LangIso | null): LangPref {
  if (iso == null) {
    return "off";
  }
  return ISO_TO_PREF[iso];
}
/** Lowercase tags that resolve to English. */
const ENGLISH_TOKENS = new Set(["en", "eng", "english"]);
/** Lowercase tags that resolve to Chinese (any script or region). */
const CHINESE_TOKENS = new Set([
  "zh", "zho", "chi", "chs", "cht", "chn", "cn", "chinese",
  "schinese", "tchinese", "simplified", "traditional",
  "zh-cn", "zh-tw", "zh-hans", "zh-hant",
]);
/** Lowercase tags that resolve to Japanese. */
const JAPANESE_TOKENS = new Set(["ja", "jp", "jpn", "japanese", "jap"]);

/**
 * Resolve a free-form language tag to one of the supported language codes.
 *
 * Matching is case-insensitive and ignores surrounding whitespace. The
 * whole tag is tried first. If that fails and the tag starts with a 2–3
 * letter code followed by "-" or "_" (e.g. "en-US", "ja_JP", "zh-Hant-TW"),
 * that leading code is tried on its own, so region- or script-qualified
 * tags resolve to their base language instead of being dropped.
 *
 * @param tag The tag to interpret; `null`, `undefined` and blank strings
 *   are accepted.
 * @returns The matching `LangIso`, or `null` when the tag is empty or not
 *   recognised.
 */
export function normalizeLanguageTag(tag: string | null | undefined): LangIso | null {
  if (!tag) return null;
  const lower = tag.trim().toLowerCase();
  if (!lower) return null;

  // Only a short leading code qualifies as a primary subtag; longer words
  // such as "simplified-..." are deliberately not split.
  const primary = /^([a-z]{2,3})[-_]/.exec(lower)?.[1];
  const attempts = primary ? [lower, primary] : [lower];

  for (const token of attempts) {
    if (ENGLISH_TOKENS.has(token)) return "eng";
    if (CHINESE_TOKENS.has(token)) return "zho";
    if (JAPANESE_TOKENS.has(token)) return "jpn";
  }
  return null;
}
/**
 * Human-readable English name for a language code.
 *
 * @param iso The language code, or `null` when none is known.
 * @returns "English", "Chinese" or "Japanese"; "Unknown" for anything else.
 */
export function languageDisplay(iso: LangIso | null): string {
  switch (iso) {
    case "eng":
      return "English";
    case "zho":
      return "Chinese";
    case "jpn":
      return "Japanese";
    default:
      return "Unknown";
  }
}
// Separators used to break a filename stem into tokens: runs of whitespace,
// ".", "_", "-", "[", "]", "(", ")", "+", "," and ";". Because "-" is a
// separator, a hyphenated tag such as "zh-cn" reaches the matcher as the
// two tokens "zh" and "cn", never as one.
const TOKEN_SPLIT = /[\s._\-\[\]()+,;]+/g;
/** Result of looking for language hints in a subtitle filename. */
export interface DetectedLanguage {
/** Single ISO code if exactly one language was detected; null when none
 * or more than one was found. */
lang: LangIso | null;
/** Display label — "English", "Chinese", "English/Chinese", "Unknown". */
label: string;
}
/**
 * Look for language hints embedded in a subtitle filename.
 *
 * The extension is removed, the remaining stem is lowercased and split on
 * `TOKEN_SPLIT`, and every token is run through `normalizeLanguageTag`.
 *
 * - No recognised token: `lang` is null and the label is "Unknown".
 * - Exactly one language: `lang` is that code and the label is its name.
 * - Several languages: the label joins their names with "/" in the fixed
 *   order English, Chinese, Japanese, while `lang` stays null so that
 *   sticky-preference matching only ever resolves to a single language.
 *
 * @param filename Subtitle file name, extension included.
 */
export function detectLanguageFromName(filename: string): DetectedLanguage {
  const ext = path.extname(filename).toLowerCase();
  const stem = ext ? filename.slice(0, -ext.length) : filename;

  const hits = new Set<LangIso>();
  stem
    .toLowerCase()
    .split(TOKEN_SPLIT)
    .forEach((token) => {
      if (!token) return;
      const iso = normalizeLanguageTag(token);
      if (iso) hits.add(iso);
    });

  // Canonical ordering keeps compound labels stable regardless of where
  // each hint sat in the filename.
  const canonical: LangIso[] = ["eng", "zho", "jpn"];
  const [first, ...others] = canonical.filter((iso) => hits.has(iso));

  if (first === undefined) {
    return { lang: null, label: "Unknown" };
  }
  if (others.length === 0) {
    return { lang: first, label: languageDisplay(first) };
  }
  const names = [first, ...others].map((iso) => languageDisplay(iso));
  return { lang: null, label: names.join("/") };
}
/** An SRT timestamp `H:MM:SS,mmm` / `HH:MM:SS,mmm`; group 1 is the clock
 * part, group 2 the milliseconds. */
const SRT_TIMESTAMP = /(\d{1,2}:\d{2}:\d{2}),(\d{3})/g;

/**
 * Pure JS SRT → WebVTT converter. Cheap enough to run on every sidecar
 * miss without spawning ffmpeg; performs no styling translation.
 *
 * Steps: strip a leading BOM, normalize CRLF / lone CR to LF, swap the
 * comma in `HH:MM:SS,mmm` for a dot, and prepend the `WEBVTT` header.
 *
 * The timestamp rewrite is applied only to cue timing lines (those
 * containing `-->`), so dialogue that merely looks like a timestamp —
 * e.g. "Meet at 10:30:00,500" — is passed through untouched.
 *
 * @param srt Full text of the SRT file.
 * @returns The equivalent WebVTT document.
 */
export function srtToVtt(srt: string): string {
  let body = srt;
  if (body.charCodeAt(0) === 0xfeff) body = body.slice(1);
  body = body.replace(/\r\n?/g, "\n");
  body = body
    .split("\n")
    .map((line) => (line.includes("-->") ? line.replace(SRT_TIMESTAMP, "$1.$2") : line))
    .join("\n");
  return `WEBVTT\n\n${body.trimStart()}`;
}
/**
 * Strip the final extension from a filename.
 *
 * @param filename Name to shorten.
 * @returns `filename` without the extension reported by `path.extname`;
 *   the input unchanged when it has no extension.
 */
export function stemOf(filename: string): string {
  const suffix = path.extname(filename);
  if (suffix.length === 0) {
    return filename;
  }
  return filename.slice(0, -suffix.length);
}
/**
 * U+FFFD REPLACEMENT CHARACTER — what a decoder emits for bytes it cannot
 * map. Written as an escape on purpose: if the raw character is stripped
 * by an editor or a transfer step the constant becomes "", no character
 * ever compares equal to it, and every candidate encoding scores as
 * error-free.
 */
const REPLACEMENT_CHAR = "\uFFFD";

/**
 * Decode a subtitle file buffer to a JS string with best-effort
 * encoding detection. Many older Asian SRTs ship as cp936/GBK or
 * Shift-JIS — feeding them through `Buffer.toString("utf8")` produces
 * mojibake. Strategy:
 *   1. If a BOM is present (UTF-8 / UTF-16 LE / UTF-16 BE), strip it and
 *      decode with the encoding it announces.
 *   2. Try strict UTF-8. If it decodes without invalid sequences, use it.
 *   3. Otherwise decode as utf8 / shift_jis / gb18030 / big5 and keep the
 *      candidate with the lowest score, where the score is the number of
 *      U+FFFD characters in the output, minus 1 for utf8, and minus 1 for
 *      shift_jis when its output contains hiragana or katakana.
 *   4. Equal scores go to the candidate listed first in step 3.
 *
 * @param buf Raw bytes of the subtitle file.
 * @returns The decoded text.
 */
export function decodeSubtitleBuffer(buf: Buffer): string {
  // BOM detection — if present, the encoding is unambiguous.
  if (buf.length >= 3 && buf[0] === 0xef && buf[1] === 0xbb && buf[2] === 0xbf) {
    return buf.subarray(3).toString("utf8");
  }
  if (buf.length >= 2 && buf[0] === 0xff && buf[1] === 0xfe) {
    return iconv.decode(buf.subarray(2), "utf-16le");
  }
  if (buf.length >= 2 && buf[0] === 0xfe && buf[1] === 0xff) {
    return iconv.decode(buf.subarray(2), "utf-16be");
  }

  // UTF-8 strict — fast path for the common case.
  try {
    return new TextDecoder("utf-8", { fatal: true }).decode(buf);
  } catch {
    /* not valid UTF-8: fall through to the heuristic */
  }

  // Hiragana + katakana blocks (U+3040–U+30FF), as escapes for the same
  // reason REPLACEMENT_CHAR is one.
  const kana = /[\u3040-\u30ff]/;
  const candidates = ["utf8", "shift_jis", "gb18030", "big5"] as const;

  let best: { text: string; score: number } | null = null;
  for (const encoding of candidates) {
    const text = iconv.decode(buf, encoding);

    let bad = 0;
    for (let i = 0; i < text.length; i++) {
      if (text[i] === REPLACEMENT_CHAR) bad++;
    }

    // shift_jis gets a one-point bonus when its output contains kana:
    // gb18030 happens to map many JP byte sequences without errors but
    // yields gibberish that the replacement count alone would not catch.
    // utf8 gets the same bonus unconditionally.
    const favoured =
      encoding === "utf8" || (encoding === "shift_jis" && kana.test(text));
    const score = favoured ? bad - 1 : bad;

    // Strict "<" keeps the earlier candidate on a tie.
    if (best === null || score < best.score) {
      best = { text, score };
    }
  }
  return best?.text ?? buf.toString("utf8");
}