Files
pinkudex/lib/video/index.ts
T
2026-05-26 22:46:00 +02:00

540 lines
19 KiB
TypeScript

import "server-only";
import path from "node:path";
import fs from "node:fs/promises";
import { extractCode, normalizeCode } from "@/lib/jav/codeParser";
import { getAppSetting } from "@/lib/db/appSettings";
import { rawDb } from "@/lib/db/client";
import { syncVideoMetadataIndex } from "./metadata";
/** Lower-case extensions (leading dot included) that mark a file as a video. */
export const VIDEO_EXTENSIONS = new Set<string>([
  ".mp4",
  ".mkv",
  ".m4v",
  ".mov",
  ".webm",
  ".avi",
  ".wmv",
  ".ts",
  ".mpg",
  ".mpeg",
  ".flv",
]);
/** Lower-case extensions (leading dot included) that mark a file as a sidecar subtitle. */
const SUBTITLE_EXTENSIONS = new Set<string>([
  ".srt",
  ".vtt",
  ".ass",
  ".ssa",
]);
/**
 * One video file the index found on disk. Produced by the directory walk
 * and by lookups against the `video_metadata` table.
 */
export interface VideoFile {
/** Full path of the file: the directory it was found in joined with its filename. */
abs: string;
/** Path relative to the configured root (main or extra) under which the
 * file was found. */
rel: string;
/** Filename (with extension). */
filename: string;
/** Normalized JAV code parsed from the filename (extension removed before parsing). */
code: string;
/** File size in bytes. */
size: number;
/** Last-modified timestamp in milliseconds, as reported by `fs.Stats.mtimeMs`. */
mtime: number;
}
/**
 * Lightweight scan-state record. Authoritative file data lives in the
 * `video_metadata` SQLite table — accessors below query it directly,
 * so this struct holds only what describes the most recent rescan.
 */
interface VideoIndex {
/** When the index was last built (epoch ms from `Date.now()`); 0 in the
 * empty state, i.e. no scan recorded. */
lastScannedAt: number;
/** All folder roots that were scanned, in order: main first, extras after.
 * Used both to display in the UI and to detect setting changes
 * (getVideoIndex compares it element-by-element with the current settings). */
rootsScanned: string[];
/** Total files matched by the most recent scan. */
count: number;
}
/** Scan state meaning "nothing scanned": returned when no roots are
 * configured or when the configured roots differ from the ones last scanned. */
const EMPTY_INDEX: VideoIndex = {
lastScannedAt: 0,
rootsScanned: [],
count: 0,
};
/** Scan state written by the most recent rescan in this process; starts
 * out as EMPTY_INDEX until a rescan replaces it. */
let cachedScanState: VideoIndex = EMPTY_INDEX;
/** Promise handed to callers of rescanVideoIndex so that concurrent calls
 * share a single scan; initially null. */
let scanInFlight: Promise<VideoIndex> | null = null;
/**
 * Shape of a `video_metadata` row as read for cache reuse during an
 * incremental walk. Column names mirror the SQL columns selected by
 * loadCachedFileIndex.
 */
interface CachedFileRow {
/** Stored full path of the file; becomes `VideoFile.abs` when the row is reused. */
abs_path: string;
/** Stored relative path. Selected from the table, but the walk recomputes
 * the relative path from `abs_path` rather than reading this field. */
rel_path: string;
/** Stored normalized code; becomes `VideoFile.code`. */
code: string;
/** Stored file size; becomes `VideoFile.size`. */
size_bytes: number;
/** Stored file mtime in ms; becomes `VideoFile.mtime`. */
mtime_ms: number;
}
/** Options accepted by walkAllRoots. */
interface WalkOpts {
/** When true, the walk starts from empty dir-mtime and file-row caches,
 * so no directory is treated as unchanged and every matching file is
 * parsed and stat-ed again.
 * Use after structural file edits that don't change dir mtime
 * (e.g. content rewrite without rename). */
force?: boolean;
}
/**
 * Walk the configured roots and produce a flat VideoFile[]. The caller
 * writes the result to the `video_metadata` table — nothing is held in
 * memory beyond the duration of one rescan.
 *
 * Incremental: each directory's mtime is compared to the value stored in
 * `video_dir_mtimes`. Every directory is still listed once (the listing
 * is needed to find subdirectories, whose mtimes change independently),
 * but when the mtime is unchanged the rows already stored in
 * `video_metadata` for that directory are reused instead of parsing and
 * stat-ing each file again.
 *
 * Roots are resolved with `path.resolve` before walking so that a trailing
 * separator or a relative root yields the same directory keys as
 * `path.dirname()` of the files below it, and each directory is walked at
 * most once per scan — overlapping or duplicated roots therefore do not
 * produce duplicate files. A file reachable from several roots is reported
 * relative to the first root that reaches it.
 *
 * Side effect: the dir-mtime cache is persisted via saveDirMtimeCache
 * before returning.
 *
 * @param roots Library roots to walk, main first, extras after.
 * @param opts  `force: true` starts from empty caches (see WalkOpts).
 * @returns The sorted file list, its length as `count`, the set of
 *   directories visited, the number of file rows reused from the cache
 *   (`reused`) and the number of directories whose files were re-examined
 *   (`rescanned`).
 */
async function walkAllRoots(
  roots: string[],
  opts: WalkOpts = {},
): Promise<{ files: VideoFile[]; count: number; visitedDirs: Set<string>; reused: number; rescanned: number }> {
  const cachedMtimes = opts.force
    ? new Map<string, number>()
    : loadDirMtimeCache();
  const visitedDirs = new Set<string>();
  const files: VideoFile[] = [];
  const cachedFilesByDir = opts.force
    ? new Map<string, CachedFileRow[]>()
    : loadCachedFileIndex();
  let reused = 0;
  let rescanned = 0;

  for (const configuredRoot of roots) {
    // path.resolve("") would silently mean "current working directory".
    if (!configuredRoot) continue;
    const root = path.resolve(configuredRoot);
    const stack: string[] = [root];

    while (stack.length > 0) {
      const dir = stack.pop();
      if (dir === undefined) break;
      // Already walked through another (overlapping or repeated) root.
      if (visitedDirs.has(dir)) continue;
      visitedDirs.add(dir);

      // Stat BEFORE listing: if the directory changes in between, the older
      // mtime is recorded and the next scan re-examines it.
      let dirStat: import("node:fs").Stats;
      try {
        dirStat = await fs.stat(dir);
      } catch {
        continue; // dir vanished mid-walk
      }
      const cachedMtime = cachedMtimes.get(dir);
      const dirUnchanged = cachedMtime != null && cachedMtime === dirStat.mtimeMs;

      // The listing is always needed to discover subdirectories, even when
      // the directory's own files come from the cache.
      let entries: import("node:fs").Dirent[];
      try {
        entries = await fs.readdir(dir, { withFileTypes: true });
      } catch {
        continue;
      }
      for (const e of entries) {
        if (e.isDirectory()) stack.push(path.join(dir, e.name));
      }

      if (dirUnchanged) {
        // Reuse cached rows for files immediately in this directory. No
        // cached rows means the directory held no indexed videos last time.
        const cached = cachedFilesByDir.get(dir);
        if (cached) {
          for (const row of cached) {
            files.push({
              abs: row.abs_path,
              rel: path.relative(root, row.abs_path),
              filename: path.basename(row.abs_path),
              code: row.code,
              size: row.size_bytes,
              mtime: row.mtime_ms,
            });
          }
          reused += cached.length;
        }
        continue;
      }

      // Dir changed (or no cache entry yet): parse + stat each video file.
      rescanned++;
      for (const e of entries) {
        if (!e.isFile()) continue;
        const ext = path.extname(e.name).toLowerCase();
        if (!VIDEO_EXTENSIONS.has(ext)) continue;
        const abs = path.join(dir, e.name);
        const stem = e.name.slice(0, e.name.length - ext.length);
        const code = extractCode(stem);
        if (!code) continue;
        const norm = normalizeCode(code);
        if (!norm) continue;
        let st: import("node:fs").Stats;
        try {
          st = await fs.stat(abs);
        } catch {
          continue; // file vanished between listing and stat
        }
        files.push({
          abs,
          rel: path.relative(root, abs),
          filename: e.name,
          code: norm,
          size: st.size,
          mtime: st.mtimeMs,
        });
      }
      // Update cached mtime so the NEXT scan sees this dir as fresh.
      cachedMtimes.set(dir, dirStat.mtimeMs);
    }
  }

  // Persist updated mtime cache for next scan.
  // NOTE(review): this happens before the caller has written `files` to
  // `video_metadata`. If that later write fails, the next incremental scan
  // may treat these directories as unchanged and reuse stale rows — confirm
  // whether persisting should move after the metadata sync.
  saveDirMtimeCache(cachedMtimes, visitedDirs);

  // Stable order across rescans.
  files.sort((a, b) => a.code.localeCompare(b.code) || a.filename.localeCompare(b.filename));
  return { files, count: files.length, visitedDirs, reused, rescanned };
}
/** Read every `video_dir_mtimes` row and index the stored mtime (ms) by
 * directory path. */
function loadDirMtimeCache(): Map<string, number> {
  type DirMtimeRow = { abs_dir: string; mtime_ms: number };
  const stored = rawDb
    .prepare(`SELECT abs_dir, mtime_ms FROM video_dir_mtimes`)
    .all() as DirMtimeRow[];
  return new Map(
    stored.map((row): [string, number] => [row.abs_dir, row.mtime_ms]),
  );
}
/** Bucket every `video_metadata` row under its `dir_path`, so the walk can
 * fetch all cached files of a directory with one Map lookup. The table is
 * read once, in a single pass. */
function loadCachedFileIndex(): Map<string, CachedFileRow[]> {
  type RowWithDir = CachedFileRow & { dir_path: string };
  const allRows = rawDb.prepare(`
SELECT abs_path, rel_path, code, size_bytes, mtime_ms, dir_path
FROM video_metadata
`).all() as RowWithDir[];
  const byDir = new Map<string, CachedFileRow[]>();
  for (const row of allRows) {
    let bucket = byDir.get(row.dir_path);
    if (bucket === undefined) {
      bucket = [];
      byDir.set(row.dir_path, bucket);
    }
    bucket.push(row);
  }
  return byDir;
}
/**
 * Write the dir-mtime cache back to `video_dir_mtimes` in one transaction:
 * upsert the entries whose directory was visited this scan, then delete
 * every stored row whose directory was not visited (deleted / moved
 * folders). A failure inside the transaction is logged, not thrown.
 *
 * @param mtimes  Directory path → mtime (ms) to store.
 * @param visited Directories seen during the current scan.
 */
function saveDirMtimeCache(mtimes: Map<string, number>, visited: Set<string>): void {
  const upsertStmt = rawDb.prepare(`
INSERT INTO video_dir_mtimes (abs_dir, mtime_ms, last_seen_at)
VALUES (?, ?, ?)
ON CONFLICT(abs_dir) DO UPDATE SET
mtime_ms = excluded.mtime_ms,
last_seen_at = excluded.last_seen_at
`);
  const stamp = Date.now();
  const persist = rawDb.transaction(() => {
    // Entries for directories that were not visited may describe moved or
    // renamed folders, so they are never written.
    mtimes.forEach((mtimeMs, absDir) => {
      if (visited.has(absDir)) upsertStmt.run(absDir, mtimeMs, stamp);
    });
    // Drop stored rows for directories that did not show up this scan.
    const known = rawDb.prepare(`SELECT abs_dir FROM video_dir_mtimes`).all() as Array<{ abs_dir: string }>;
    const removeStmt = rawDb.prepare(`DELETE FROM video_dir_mtimes WHERE abs_dir = ?`);
    known
      .filter((row) => !visited.has(row.abs_dir))
      .forEach((row) => {
        removeStmt.run(row.abs_dir);
      });
  });
  try {
    persist();
  } catch (e) {
    console.error("[video] failed to save dir mtime cache:", e);
  }
}
/** Characters that can be part of a code token in a lower-cased filename. */
const SUBTITLE_CODE_CHAR = /[a-z0-9]/;

/**
 * True when `codeLower` occurs in `nameLower` as a delimited token, i.e.
 * some occurrence is bounded on both sides by the start/end of the name or
 * by a non-alphanumeric character. Every occurrence is tested, not just the
 * first one. Both arguments must already be lower-cased.
 */
function containsCodeToken(nameLower: string, codeLower: string): boolean {
  if (!codeLower) return false; // also keeps the search loop below finite
  const isBoundary = (c: string | undefined): boolean =>
    c === undefined || !SUBTITLE_CODE_CHAR.test(c);
  for (
    let idx = nameLower.indexOf(codeLower);
    idx >= 0;
    idx = nameLower.indexOf(codeLower, idx + 1)
  ) {
    const before = idx === 0 ? undefined : nameLower[idx - 1];
    const after = nameLower[idx + codeLower.length]; // undefined past the end
    if (isBoundary(before) && isBoundary(after)) return true;
  }
  return false;
}

/** Lower-cased names of the subtitle files directly inside `dir`; an
 * unreadable directory yields an empty list. */
async function listSubtitleNames(dir: string): Promise<string[]> {
  let entries: import("node:fs").Dirent[];
  try {
    entries = await fs.readdir(dir, { withFileTypes: true });
  } catch {
    return [];
  }
  const names: string[] = [];
  for (const e of entries) {
    if (!e.isFile()) continue;
    if (!SUBTITLE_EXTENSIONS.has(path.extname(e.name).toLowerCase())) continue;
    names.push(e.name.toLowerCase());
  }
  return names;
}

/**
 * Walk every place a sidecar subtitle could live and return the set of
 * canonical codes that have at least one. Cheap signal — no ffprobe.
 *
 * - Each video's own directory. A subtitle file counts for the video when
 *   its name starts with the video's stem followed by a dot
 *   (`YUJ-001.en.srt` next to `YUJ-001.mp4`), or when the name contains the
 *   video's code as a delimited token (`notes-yuj-001-bad.srt` counts for
 *   YUJ-001; `xyuj-0012.srt` and a stray `OTHER-001.srt` do not).
 * - Each entry in `subtitleExtraPaths` (recursive walk, depth 3) —
 *   extracts the code from the filename directly.
 * - data/generated-subtitles/<code>/ — directory name IS the code.
 *
 * Result is consumed once by syncHasSubtitleColumn and discarded — no
 * persistent in-memory copy.
 *
 * @param files Videos found by the current scan.
 * @returns Normalized codes that have at least one subtitle.
 */
async function collectSubtitleCodes(files: VideoFile[]): Promise<Set<string>> {
  const codes = new Set<string>();

  // Same-folder scan. Each directory is listed once and reduced to its
  // subtitle filenames, so a folder with N videos costs one readdir and each
  // video is compared against subtitle files only.
  const subtitleNamesByDir = new Map<string, string[]>();
  for (const file of files) {
    if (codes.has(file.code)) continue; // another file already settled this code
    const dir = path.dirname(file.abs);
    let names = subtitleNamesByDir.get(dir);
    if (!names) {
      names = await listSubtitleNames(dir);
      subtitleNamesByDir.set(dir, names);
    }
    if (names.length === 0) continue;

    const stem = file.filename.slice(0, file.filename.length - path.extname(file.filename).length);
    const stemPrefix = stem.toLowerCase() + ".";
    const codeLower = file.code.toLowerCase();
    const matched = names.some(
      (name) => name.startsWith(stemPrefix) || containsCodeToken(name, codeLower),
    );
    if (matched) codes.add(file.code);
  }

  // Persistent subtitle library roots — extract codes from filenames.
  const extraRoots = (getAppSetting("subtitleExtraPaths") ?? []).filter(Boolean);
  for (const root of extraRoots) {
    await walkSubtitleRoot(root, codes, 3);
  }

  // data/generated-subtitles/<code>/ — directory name is the code.
  const generatedRoot = path.join(process.cwd(), "data", "generated-subtitles");
  try {
    const subdirs = await fs.readdir(generatedRoot, { withFileTypes: true });
    for (const d of subdirs) {
      if (!d.isDirectory()) continue;
      const dirAbs = path.join(generatedRoot, d.name);
      let entries: import("node:fs").Dirent[];
      try {
        entries = await fs.readdir(dirAbs, { withFileTypes: true });
      } catch {
        continue;
      }
      const hasSub = entries.some(
        (e) => e.isFile() && SUBTITLE_EXTENSIONS.has(path.extname(e.name).toLowerCase()),
      );
      if (hasSub) {
        const norm = normalizeCode(d.name);
        if (norm) codes.add(norm);
      }
    }
  } catch { /* generated-subtitles not present yet — fine */ }
  return codes;
}
/**
 * Depth-limited walk of a subtitle library folder. For every subtitle file
 * found, the code parsed from its filename (extension removed) is
 * normalized and added to `out`. Unreadable directories are skipped.
 *
 * @param root     Folder to start from (depth 0).
 * @param out      Set that receives the normalized codes.
 * @param maxDepth Subdirectories are entered only while the current
 *                 directory's depth is below this value.
 */
async function walkSubtitleRoot(root: string, out: Set<string>, maxDepth: number): Promise<void> {
  const pending: Array<{ dir: string; depth: number }> = [{ dir: root, depth: 0 }];
  for (let frame = pending.pop(); frame !== undefined; frame = pending.pop()) {
    const { dir, depth } = frame;
    let listing: import("node:fs").Dirent[];
    try {
      listing = await fs.readdir(dir, { withFileTypes: true });
    } catch {
      continue;
    }
    for (const item of listing) {
      if (item.isDirectory()) {
        if (depth < maxDepth) {
          pending.push({ dir: path.join(dir, item.name), depth: depth + 1 });
        }
        continue;
      }
      if (!item.isFile()) continue;
      const ext = path.extname(item.name).toLowerCase();
      if (!SUBTITLE_EXTENSIONS.has(ext)) continue;
      const parsed = extractCode(item.name.slice(0, item.name.length - ext.length));
      if (!parsed) continue;
      const normalized = normalizeCode(parsed);
      if (normalized) out.add(normalized);
    }
  }
}
/**
 * Exposed for path-allowlist checks (e.g. subtitle file resolution).
 * Public wrapper around configuredRoots().
 *
 * @returns The trimmed, non-empty configured roots: main library path
 *   first, extra paths after.
 */
export function getConfiguredVideoRoots(): string[] {
return configuredRoots();
}
/**
 * Read the video roots from app settings: `videoLibraryPath` first, then
 * each entry of `videoExtraPaths`. Values are trimmed and blank ones are
 * dropped.
 */
function configuredRoots(): string[] {
  const primary = (getAppSetting("videoLibraryPath") || "").trim();
  const additional = getAppSetting("videoExtraPaths") ?? [];
  const collected: string[] = primary ? [primary] : [];
  for (const candidate of additional) {
    const trimmed = (candidate ?? "").trim();
    if (trimmed) collected.push(trimmed);
  }
  return collected;
}
/** True when both root lists have the same length and the same entries in
 * the same order (strict string equality). */
function rootsEqual(a: string[], b: string[]): boolean {
  return a.length === b.length && a.every((entry, i) => entry === b[i]);
}
/**
 * Scan-state probe — lets API routes check whether the cached scan state
 * still belongs to the current settings.
 *
 * @returns The cached scan state when at least one root is configured and
 *   the configured roots equal the roots of the last scan; otherwise the
 *   empty state (the caller can then trigger a rescan).
 */
export function getVideoIndex(): VideoIndex {
  const current = configuredRoots();
  const stillValid =
    current.length > 0 && rootsEqual(cachedScanState.rootsScanned, current);
  return stillValid ? cachedScanState : EMPTY_INDEX;
}
/**
 * One full rescan pass: walk the roots, persist the file table, refresh the
 * `has_video` / `has_subtitle` columns and record the new scan state.
 */
async function runVideoRescan(roots: string[], force: boolean | undefined): Promise<VideoIndex> {
  const t0 = Date.now();
  const { files, count, reused, rescanned } = await walkAllRoots(roots, { force });
  const walkMs = Date.now() - t0;
  console.log(
    `[video] rescan walk in ${walkMs}ms — ${count} files (${reused} reused, ${rescanned} dir(s) rewalked${force ? ", forced" : ""})`,
  );
  // Persist the file table first — has_video / has_subtitle bulk
  // updates and metadata sync all run off it.
  await syncVideoMetadataIndex(files);
  syncHasVideoColumn(files);
  const subtitleCodes = await collectSubtitleCodes(files);
  syncHasSubtitleColumn(subtitleCodes);
  cachedScanState = {
    lastScannedAt: Date.now(),
    rootsScanned: roots,
    count,
  };
  return cachedScanState;
}

/** Rebuild the index from disk. Coalesces concurrent calls. Authoritative
 * data lands in the `video_metadata` table; this function returns only
 * scan-state metadata.
 *
 * Default mode is incremental — directories whose mtime hasn't
 * changed since the last scan reuse cached file rows instead of
 * re-examining each file. Pass `{force:true}` to bypass the dir-mtime
 * cache (e.g. after content edits that don't bump dir mtime).
 *
 * A call that arrives while a scan is running receives that scan's result;
 * its own `force` flag is not applied in that case.
 *
 * With no roots configured the scan state is reset to empty and returned
 * without registering an in-flight scan. Previously that path completed
 * synchronously inside the async closure, so its `finally` cleared
 * `scanInFlight` BEFORE the outer assignment stored the already-resolved
 * promise — leaving `scanInFlight` set for good and turning every later
 * rescan into a no-op.
 *
 * @param opts `force: true` bypasses the incremental caches.
 * @returns Scan-state metadata of the scan that served this call.
 */
export async function rescanVideoIndex(opts: { force?: boolean } = {}): Promise<VideoIndex> {
  if (scanInFlight) return scanInFlight;

  const roots = configuredRoots();
  if (roots.length === 0) {
    cachedScanState = { ...EMPTY_INDEX };
    return cachedScanState;
  }

  const scan = runVideoRescan(roots, opts.force);
  scanInFlight = scan;
  try {
    return await scan;
  } finally {
    // Runs strictly after the assignment above; the identity check makes
    // sure only the scan that registered itself clears the slot.
    if (scanInFlight === scan) scanInFlight = null;
  }
}
/**
 * Mirror the freshly-walked code list into `images.has_video` so SQL
 * filters / counts can use the column directly. Inside one transaction the
 * flag is cleared on every row that has it, then set for the distinct
 * codes of `files`. A failure is logged, not thrown.
 */
function syncHasVideoColumn(files: VideoFile[]): void {
  const distinctCodes = [...new Set(files.map((f) => f.code))];
  // Batches keep each statement well below SQLite's bind-parameter cap.
  const BATCH_SIZE = 500;
  const apply = rawDb.transaction(() => {
    rawDb.prepare(`UPDATE images SET has_video = 0 WHERE has_video = 1`).run();
    for (let start = 0; start < distinctCodes.length; start += BATCH_SIZE) {
      const batch = distinctCodes.slice(start, start + BATCH_SIZE);
      const marks = Array.from(batch, () => "?").join(",");
      rawDb.prepare(
        `UPDATE images SET has_video = 1 WHERE upper(code) IN (${marks})`,
      ).run(...batch);
    }
  });
  try {
    apply();
  } catch (e) {
    console.error("[video] failed to sync has_video column:", e);
  }
}
/**
 * Mirror the freshly-collected subtitle code set into
 * `images.has_subtitle`. Inside one transaction the flag is cleared on
 * every row that has it, then set for the given codes in batches of 500
 * bind parameters. A failure is logged, not thrown.
 */
function syncHasSubtitleColumn(subtitleCodes: Set<string>): void {
  const codeList = [...subtitleCodes];
  const BATCH_SIZE = 500;
  const apply = rawDb.transaction(() => {
    rawDb.prepare(`UPDATE images SET has_subtitle = 0 WHERE has_subtitle = 1`).run();
    for (let start = 0; start < codeList.length; start += BATCH_SIZE) {
      const batch = codeList.slice(start, start + BATCH_SIZE);
      const marks = Array.from(batch, () => "?").join(",");
      rawDb.prepare(
        `UPDATE images SET has_subtitle = 1 WHERE upper(code) IN (${marks})`,
      ).run(...batch);
    }
  });
  try {
    apply();
  } catch (e) {
    console.error("[video] failed to sync has_subtitle column:", e);
  }
}
/**
 * Shape of the `video_metadata` columns selected by findVideosForCode.
 * Field-for-field the same as CachedFileRow.
 */
interface VideoMetaRow {
/** Stored full path; mapped to `VideoFile.abs` (and its basename to `filename`). */
abs_path: string;
/** Stored relative path; mapped to `VideoFile.rel`. */
rel_path: string;
/** Stored code; mapped to `VideoFile.code`. */
code: string;
/** Stored file size; mapped to `VideoFile.size`. */
size_bytes: number;
/** Stored file mtime in ms; mapped to `VideoFile.mtime`. */
mtime_ms: number;
}
/**
 * Look up files for a single code. Reads directly from the
 * `video_metadata` table so the result is always current with the most
 * recent rescan.
 *
 * The query compares `upper(code)`, so the bound value is always
 * upper-cased here. When normalization yields nothing usable (null,
 * undefined or an empty string) the raw input is used instead — the same
 * falsy test the rest of this module applies to normalizeCode results.
 *
 * @param code Code to look up; null, undefined and "" return an empty list.
 * @returns Matching files ordered by relative path, case-insensitively.
 */
export function findVideosForCode(code: string | null | undefined): VideoFile[] {
  if (!code) return [];
  const lookup = (normalizeCode(code) || code).toUpperCase();
  const rows = rawDb.prepare(`
SELECT abs_path, rel_path, code, size_bytes, mtime_ms
FROM video_metadata
WHERE upper(code) = ?
ORDER BY rel_path COLLATE NOCASE
`).all(lookup) as VideoMetaRow[];
  return rows.map((r) => ({
    abs: r.abs_path,
    rel: r.rel_path,
    filename: path.basename(r.abs_path),
    code: r.code,
    size: r.size_bytes,
    mtime: r.mtime_ms,
  }));
}
/** Upper-cased set of every code that has at least one row in
 * `video_metadata` — for fast existence checks. */
export function getCodesWithVideos(): Set<string> {
  const found = rawDb.prepare(`
SELECT DISTINCT upper(code) AS code FROM video_metadata
`).all() as Array<{ code: string }>;
  const present = new Set<string>();
  for (const { code } of found) present.add(code);
  return present;
}
/** Upper-cased set of every code flagged as having a subtitle. Reads the
 * `images.has_subtitle` column, which is populated at rescan time. */
export function getCodesWithSubtitles(): Set<string> {
  const found = rawDb.prepare(`
SELECT DISTINCT upper(code) AS code FROM images WHERE has_subtitle = 1 AND code IS NOT NULL
`).all() as Array<{ code: string }>;
  const flagged = new Set<string>();
  for (const { code } of found) flagged.add(code);
  return flagged;
}