// pinkudex/lib/video/index.ts

import "server-only";
import path from "node:path";
import fs from "node:fs/promises";
import { extractCode, normalizeCode } from "@/lib/jav/codeParser";
import { getAppSetting } from "@/lib/db/appSettings";
import { rawDb } from "@/lib/db/client";
import { syncVideoMetadataIndex } from "./metadata";

/**
 * File extensions (lower-case, leading dot included) that the scanner
 * accepts as video files. Callers lower-case the extension before lookup.
 */
export const VIDEO_EXTENSIONS = new Set<string>([
  ".mp4",
  ".mkv",
  ".m4v",
  ".mov",
  ".webm",
  ".avi",
  ".wmv",
  ".ts",
  ".mpg",
  ".mpeg",
  ".flv",
]);

/** File extensions (lower-case, leading dot included) recognised as subtitle sidecars. */
const SUBTITLE_EXTENSIONS = new Set<string>([
  ".srt",
  ".vtt",
  ".ass",
  ".ssa",
]);

/**
 * One video file the index found on disk.
 *
 * Built either from a fresh directory walk or from a `video_metadata`
 * row (cached reuse during a walk, and the lookup helpers in this file).
 */
export interface VideoFile {
  /** Absolute path on disk. */
  abs: string;
  /** Path relative to a video root. During a walk this is relative to the
   *  scanned root the file was found under (main or extra); lookup helpers
   *  return the `rel_path` stored in `video_metadata`. */
  rel: string;
  /** Filename (with extension) — the basename of `abs`. */
  filename: string;
  /** Normalized JAV code parsed from the filename (extension stripped,
   *  then `extractCode` + `normalizeCode`). */
  code: string;
  /** File size in bytes. */
  size: number;
  /** Last-modified timestamp (ms). */
  mtime: number;
}

/**
 * Lightweight scan-state record. Authoritative file data lives in the
 * `video_metadata` SQLite table — accessors below query it directly,
 * so this struct holds only what describes the most recent rescan.
 */
interface VideoIndex {
  /** When the index was last built (`Date.now()` epoch milliseconds;
   *  0 in the empty state, i.e. no scan recorded). */
  lastScannedAt: number;
  /** All folder roots that were scanned, in order: main first, extras after.
   *  Used both to display in the UI and to detect setting changes. */
  rootsScanned: string[];
  /** Total files matched by the most recent scan. */
  count: number;
}

/** Scan state meaning "nothing scanned": returned when no roots are
 *  configured or when the configured roots no longer match the last scan. */
const EMPTY_INDEX: VideoIndex = {
  lastScannedAt: 0,
  rootsScanned: [],
  count: 0,
};

/** Scan state recorded by the most recent rescan in this process
 *  (starts out as the empty state). */
let cachedScanState: VideoIndex = EMPTY_INDEX;
/** Promise of the rescan currently running, or null when idle. Used to
 *  coalesce concurrent rescan requests onto a single walk. */
let scanInFlight: Promise<VideoIndex> | null = null;

/**
 * Row shape read from `video_metadata` when an unchanged directory's
 * file entries are reused instead of being re-stat'ed. Field names
 * mirror the selected column names.
 */
interface CachedFileRow {
  /** Absolute path of the file. */
  abs_path: string;
  /** Stored relative path (the walk recomputes `rel` from `abs_path` instead of using this). */
  rel_path: string;
  /** Code stored for the file. */
  code: string;
  /** File size in bytes as recorded in the table. */
  size_bytes: number;
  /** File mtime (ms) as recorded in the table. */
  mtime_ms: number;
}

/** Options for the directory walk. */
interface WalkOpts {
  /** When true, the walk starts with empty dir-mtime and cached-file
   *  maps, so every directory counts as changed and every matching file
   *  is stat'ed again. Use after edits that don't bump the directory
   *  mtime (e.g. a file's content rewritten in place, without a rename). */
  force?: boolean;
}

/**
 * Walk the configured roots and produce a flat VideoFile[]. The caller
 * writes the result to the `video_metadata` table — nothing is held in
 * memory beyond the duration of one rescan.
 *
 * Incremental: each directory's mtime is compared to the value stored
 * in `video_dir_mtimes`. Every directory is still listed once with
 * `readdir` (that is how subdirectories are discovered — their mtimes
 * change independently), but when the directory's own mtime is
 * unchanged the per-file `stat` calls are skipped and its immediate
 * file rows are reused from `video_metadata`.
 *
 * Two guarantees the walk maintains:
 *  - A directory only counts as visited once both its `stat` and its
 *    `readdir` succeeded. A directory that failed to read therefore has
 *    its mtime row pruned, so the next scan re-reads it from disk
 *    rather than treating it as "unchanged" and reusing rows for files
 *    this scan could not see.
 *  - Each directory is processed at most once per walk, so duplicated
 *    or overlapping roots don't emit the same file twice. A file's
 *    `rel` is relative to the first root that reaches its directory.
 *
 * @param roots Absolute root directories to walk, in priority order.
 * @param opts  `force: true` ignores both caches and re-stats everything.
 * @returns The matched files (sorted by code, then filename), their
 *          count, the set of directories successfully read, the number
 *          of file rows reused from cache, and the number of
 *          directories whose files were re-stat'ed.
 */
async function walkAllRoots(
  roots: string[],
  opts: WalkOpts = {},
): Promise<{ files: VideoFile[]; count: number; visitedDirs: Set<string>; reused: number; rescanned: number }> {
  const cachedMtimes = opts.force
    ? new Map<string, number>()
    : loadDirMtimeCache();
  const visitedDirs = new Set<string>();
  const files: VideoFile[] = [];
  const cachedFilesByDir = opts.force
    ? new Map<string, CachedFileRow[]>()
    : loadCachedFileIndex();

  let reused = 0;
  let rescanned = 0;

  for (const root of roots) {
    const stack: string[] = [root];
    while (stack.length > 0) {
      const dir = stack.pop();
      // Skip directories already handled via an earlier (overlapping) root.
      if (dir === undefined || visitedDirs.has(dir)) continue;

      let dirStat: import("node:fs").Stats;
      let entries: import("node:fs").Dirent[];
      try {
        dirStat = await fs.stat(dir);
        // One readdir per dir is needed regardless of cache state: it is
        // the only source of the subdirectory list.
        entries = await fs.readdir(dir, { withFileTypes: true });
      } catch {
        // Dir vanished or is unreadable. Leaving it out of `visitedDirs`
        // gets its stale mtime row pruned, forcing a real re-read later.
        continue;
      }
      visitedDirs.add(dir);

      // Always recurse — subdir mtimes are tracked independently.
      for (const e of entries) {
        if (e.isDirectory()) stack.push(path.join(dir, e.name));
      }

      const cachedMtime = cachedMtimes.get(dir);
      const dirUnchanged = cachedMtime != null && cachedMtime === dirStat.mtimeMs;

      if (dirUnchanged) {
        // Reuse cached rows for files immediately in this directory.
        const cached = cachedFilesByDir.get(dir);
        if (cached) {
          for (const row of cached) {
            files.push({
              abs: row.abs_path,
              rel: path.relative(root, row.abs_path),
              filename: path.basename(row.abs_path),
              code: row.code,
              size: row.size_bytes,
              mtime: row.mtime_ms,
            });
          }
          reused += cached.length;
        }
        continue;
      }

      // Dir changed (or no cache entry yet): stat each matching file.
      rescanned++;
      for (const e of entries) {
        if (!e.isFile()) continue;
        const ext = path.extname(e.name).toLowerCase();
        if (!VIDEO_EXTENSIONS.has(ext)) continue;
        const stem = e.name.slice(0, e.name.length - ext.length);
        const code = extractCode(stem);
        if (!code) continue;
        const norm = normalizeCode(code);
        if (!norm) continue;
        const abs = path.join(dir, e.name);
        let st: import("node:fs").Stats;
        try {
          st = await fs.stat(abs);
        } catch {
          continue; // file vanished between readdir and stat
        }
        files.push({
          abs,
          rel: path.relative(root, abs),
          filename: e.name,
          code: norm,
          size: st.size,
          mtime: st.mtimeMs,
        });
      }
      // Record the fresh mtime so the NEXT scan can treat this dir as unchanged.
      cachedMtimes.set(dir, dirStat.mtimeMs);
    }
  }

  // Persist updated mtime cache for next scan (also prunes unvisited dirs).
  saveDirMtimeCache(cachedMtimes, visitedDirs);

  // Stable order across rescans.
  files.sort((a, b) => a.code.localeCompare(b.code) || a.filename.localeCompare(b.filename));
  return { files, count: files.length, visitedDirs, reused, rescanned };
}

/**
 * Read every `video_dir_mtimes` row and return it as a lookup from
 * absolute directory path to the mtime (ms) recorded for it.
 */
function loadDirMtimeCache(): Map<string, number> {
  const stored = rawDb
    .prepare(`SELECT abs_dir, mtime_ms FROM video_dir_mtimes`)
    .all() as Array<{ abs_dir: string; mtime_ms: number }>;
  return new Map<string, number>(
    stored.map(({ abs_dir, mtime_ms }): [string, number] => [abs_dir, mtime_ms]),
  );
}

/**
 * Bucket every `video_metadata` row by its `dir_path` so the walk can
 * fetch all cached files of a directory with one in-memory lookup.
 * Costs a single linear pass over the table.
 */
function loadCachedFileIndex(): Map<string, CachedFileRow[]> {
  const allRows = rawDb.prepare(`
    SELECT abs_path, rel_path, code, size_bytes, mtime_ms, dir_path
    FROM video_metadata
  `).all() as Array<CachedFileRow & { dir_path: string }>;

  const byDir = new Map<string, CachedFileRow[]>();
  for (const row of allRows) {
    let bucket = byDir.get(row.dir_path);
    if (bucket === undefined) {
      bucket = [];
      byDir.set(row.dir_path, bucket);
    }
    bucket.push(row);
  }
  return byDir;
}

/**
 * Write the directory-mtime cache back to `video_dir_mtimes`.
 *
 * Inside one transaction: upserts the mtime of every directory that is
 * both in `mtimes` and in `visited`, then deletes every stored row whose
 * directory is not in `visited` (folders that no longer exist or were
 * not reached). Failures are logged and swallowed — the cache is an
 * optimisation, not a source of truth.
 */
function saveDirMtimeCache(mtimes: Map<string, number>, visited: Set<string>): void {
  const upsertStmt = rawDb.prepare(`
    INSERT INTO video_dir_mtimes (abs_dir, mtime_ms, last_seen_at)
    VALUES (?, ?, ?)
    ON CONFLICT(abs_dir) DO UPDATE SET
      mtime_ms = excluded.mtime_ms,
      last_seen_at = excluded.last_seen_at
  `);
  const seenAt = Date.now();

  const persist = rawDb.transaction(() => {
    // Entries for dirs not visited this scan may describe moved/renamed
    // folders, so they are never written back.
    mtimes.forEach((mtimeMs, absDir) => {
      if (visited.has(absDir)) upsertStmt.run(absDir, mtimeMs, seenAt);
    });

    // Drop stored rows for dirs this scan did not see — one pass over
    // the table.
    const storedDirs = rawDb.prepare(`SELECT abs_dir FROM video_dir_mtimes`).all() as Array<{ abs_dir: string }>;
    const removeStmt = rawDb.prepare(`DELETE FROM video_dir_mtimes WHERE abs_dir = ?`);
    storedDirs
      .filter(({ abs_dir }) => !visited.has(abs_dir))
      .forEach(({ abs_dir }) => {
        removeStmt.run(abs_dir);
      });
  });

  try {
    persist();
  } catch (err) {
    console.error("[video] failed to save dir mtime cache:", err);
  }
}

/**
 * True when `token` occurs in `haystack` as a delimited token: the
 * characters immediately before and after the occurrence are either the
 * string edge or not `[a-z0-9]`. Every occurrence is tried, not just the
 * first, so `yuj-0012_yuj-001.srt` still matches `yuj-001`.
 *
 * Both arguments are expected to be lower-cased already. An empty token
 * never matches.
 */
function containsDelimitedToken(haystack: string, token: string): boolean {
  if (token === "") return false;
  const isBoundary = (ch: string | undefined): boolean =>
    ch === undefined || !/[a-z0-9]/.test(ch);
  for (
    let idx = haystack.indexOf(token);
    idx >= 0;
    idx = haystack.indexOf(token, idx + 1)
  ) {
    const end = idx + token.length;
    const before = idx === 0 ? undefined : haystack[idx - 1];
    const after = end >= haystack.length ? undefined : haystack[end];
    if (isBoundary(before) && isBoundary(after)) return true;
  }
  return false;
}

/**
 * Walk every place a sidecar subtitle could live and return the set of
 * canonical codes that have at least one. Cheap signal — no ffprobe.
 *
 *  - Each video's own directory: a subtitle file counts when its name
 *    starts with the video's stem followed by a dot, or contains the
 *    video's code as a delimited token (so a stray `OTHER-001.srt` next
 *    to `YUJ-001.mp4` doesn't taint YUJ-001).
 *  - Each entry in `subtitleExtraPaths` (recursive walk, depth 3) —
 *    extracts the code from the filename directly.
 *  - data/generated-subtitles/<code>/ — directory name IS the code.
 *
 * Result is consumed once by syncHasSubtitleColumn and discarded — no
 * persistent in-memory copy.
 *
 * @param files Video files from the current scan.
 * @returns Codes for which at least one subtitle file was found.
 */
async function collectSubtitleCodes(files: VideoFile[]): Promise<Set<string>> {
  const codes = new Set<string>();

  // Same-folder scan: per video, look at sibling files. Cache directory
  // listings so a folder with N videos is only listed once.
  const dirCache = new Map<string, import("node:fs").Dirent[]>();
  for (const file of files) {
    // Another file with the same code already matched — nothing to learn.
    if (codes.has(file.code)) continue;

    const dir = path.dirname(file.abs);
    let entries = dirCache.get(dir);
    if (!entries) {
      try {
        entries = await fs.readdir(dir, { withFileTypes: true });
      } catch {
        entries = [];
      }
      dirCache.set(dir, entries);
    }
    const stem = file.filename.slice(0, file.filename.length - path.extname(file.filename).length);
    // `<stem>.` prefix covers both `<stem>.srt` and `<stem>.en.srt`.
    const stemPrefix = stem.toLowerCase() + ".";
    const codeLower = file.code.toLowerCase();
    for (const e of entries) {
      if (!e.isFile()) continue;
      const ext = path.extname(e.name).toLowerCase();
      if (!SUBTITLE_EXTENSIONS.has(ext)) continue;
      const lower = e.name.toLowerCase();
      // The code must appear as a delimited token — a bare `.includes`
      // would attribute `yuj-0012.srt` or `xyuj-001.srt` to YUJ-001.
      if (lower.startsWith(stemPrefix) || containsDelimitedToken(lower, codeLower)) {
        codes.add(file.code);
        break;
      }
    }
  }

  // Persistent subtitle library roots — extract codes from filenames.
  const extraRoots = (getAppSetting("subtitleExtraPaths") ?? []).filter(Boolean);
  for (const root of extraRoots) {
    await walkSubtitleRoot(root, codes, 3);
  }

  // data/generated-subtitles/<code>/ — directory name is the code.
  const generatedRoot = path.join(process.cwd(), "data", "generated-subtitles");
  try {
    const subdirs = await fs.readdir(generatedRoot, { withFileTypes: true });
    for (const d of subdirs) {
      if (!d.isDirectory()) continue;
      const dirAbs = path.join(generatedRoot, d.name);
      let entries: import("node:fs").Dirent[];
      try {
        entries = await fs.readdir(dirAbs, { withFileTypes: true });
      } catch {
        continue;
      }
      const hasSub = entries.some(
        (e) => e.isFile() && SUBTITLE_EXTENSIONS.has(path.extname(e.name).toLowerCase()),
      );
      if (hasSub) {
        const norm = normalizeCode(d.name);
        if (norm) codes.add(norm);
      }
    }
  } catch { /* generated-subtitles not present yet — fine */ }

  return codes;
}

/**
 * Depth-limited walk of one subtitle library root. For every subtitle
 * file found, the code parsed from its filename (extension stripped) is
 * normalized and added to `out`.
 *
 * @param root     Directory to start from (depth 0).
 * @param out      Set that receives the normalized codes; mutated in place.
 * @param maxDepth Subdirectories are followed only while the current
 *                 directory's depth is below this value.
 */
async function walkSubtitleRoot(root: string, out: Set<string>, maxDepth: number): Promise<void> {
  const pending: Array<{ dir: string; depth: number }> = [{ dir: root, depth: 0 }];

  for (let frame = pending.pop(); frame !== undefined; frame = pending.pop()) {
    const { dir, depth } = frame;

    let listing: import("node:fs").Dirent[];
    try {
      listing = await fs.readdir(dir, { withFileTypes: true });
    } catch {
      // Unreadable or missing directory — skip it.
      continue;
    }

    for (const entry of listing) {
      if (entry.isDirectory()) {
        if (depth < maxDepth) {
          pending.push({ dir: path.join(dir, entry.name), depth: depth + 1 });
        }
        continue;
      }
      if (!entry.isFile()) continue;

      const ext = path.extname(entry.name).toLowerCase();
      if (!SUBTITLE_EXTENSIONS.has(ext)) continue;

      const rawCode = extractCode(entry.name.slice(0, entry.name.length - ext.length));
      if (!rawCode) continue;

      const canonical = normalizeCode(rawCode);
      if (canonical) out.add(canonical);
    }
  }
}

/**
 * Exposed for path-allowlist checks (e.g. subtitle file resolution).
 *
 * @returns The currently configured video roots as produced by
 *          `configuredRoots()`: main library path first, then the extra
 *          paths, each trimmed, with blank entries dropped.
 */
export function getConfiguredVideoRoots(): string[] {
  return configuredRoots();
}

/**
 * Read the video roots from app settings: `videoLibraryPath` first,
 * then each `videoExtraPaths` entry in order. Every value is trimmed
 * and blank ones are left out.
 */
function configuredRoots(): string[] {
  const mainRoot = (getAppSetting("videoLibraryPath") || "").trim();
  const extraRoots = getAppSetting("videoExtraPaths") ?? [];

  const roots: string[] = mainRoot ? [mainRoot] : [];
  for (const extra of extraRoots) {
    const trimmed = (extra ?? "").trim();
    if (!trimmed) continue;
    roots.push(trimmed);
  }
  return roots;
}

/** True when both root lists have the same length and the same entries in the same order. */
function rootsEqual(a: string[], b: string[]): boolean {
  return a.length === b.length && a.every((root, i) => root === b[i]);
}

/**
 * Scan-state probe for API routes. Hands back the cached scan state only
 * while it still describes the currently configured roots; if no roots
 * are configured, or they differ from the ones last scanned, the empty
 * state is returned so the caller can trigger a rescan.
 */
export function getVideoIndex(): VideoIndex {
  const currentRoots = configuredRoots();
  const stillValid =
    currentRoots.length !== 0 && rootsEqual(cachedScanState.rootsScanned, currentRoots);
  return stillValid ? cachedScanState : EMPTY_INDEX;
}

/**
 * Rebuild the index from disk. Coalesces concurrent calls. Authoritative
 * data lands in the `video_metadata` table; this function returns only
 * scan-state metadata.
 *
 * Default mode is incremental — directories whose mtime hasn't changed
 * since the last scan reuse cached file rows instead of stat-ing every
 * file. Pass `{force:true}` to bypass the dir-mtime cache (e.g. after
 * content edits that don't bump dir mtime).
 *
 * Coalescing: while a scan is running, every call returns that scan's
 * promise — including calls that pass `force`, which therefore inherit
 * the in-flight scan's mode.
 *
 * @param opts `force` — rescan everything, ignoring cached dir mtimes.
 * @returns The scan state recorded by the scan (empty state when no
 *          roots are configured). Rejects if the scan fails.
 */
export async function rescanVideoIndex(opts: { force?: boolean } = {}): Promise<VideoIndex> {
  const roots = configuredRoots();
  if (scanInFlight) return scanInFlight;

  const scan: Promise<VideoIndex> = (async (): Promise<VideoIndex> => {
    const cleanRoots = roots.map((r) => (r ?? "").trim()).filter(Boolean);
    if (cleanRoots.length === 0) {
      // Fresh array so callers can't mutate the shared EMPTY_INDEX roots.
      cachedScanState = { ...EMPTY_INDEX, rootsScanned: [] };
      return cachedScanState;
    }
    const t0 = Date.now();
    const { files, count, reused, rescanned } = await walkAllRoots(cleanRoots, { force: opts.force });
    const walkMs = Date.now() - t0;
    console.log(
      `[video] rescan walk in ${walkMs}ms — ${count} files (${reused} reused, ${rescanned} dir(s) rewalked${opts.force ? ", forced" : ""})`,
    );
    // Persist the file table first — has_video / has_subtitle bulk
    // updates and metadata sync all run off it.
    await syncVideoMetadataIndex(files);
    syncHasVideoColumn(files);
    const subtitleCodes = await collectSubtitleCodes(files);
    syncHasSubtitleColumn(subtitleCodes);

    cachedScanState = {
      lastScannedAt: Date.now(),
      rootsScanned: cleanRoots,
      count,
    };
    return cachedScanState;
  })().finally(() => {
    // Clear the in-flight marker from a promise callback, never from
    // inside the async body: the no-roots path finishes before its first
    // `await`, so an in-body `finally` would run BEFORE the assignment
    // below and leave a settled promise stuck in `scanInFlight`, making
    // every later rescan a no-op.
    if (scanInFlight === scan) scanInFlight = null;
  });

  scanInFlight = scan;
  return scan;
}

/**
 * Rewrite `images.has_video` from the freshly walked files so SQL
 * filters and counts can rely on the column. Runs as one transaction:
 * clear every set flag, then set it for each distinct code found.
 * Errors are logged and swallowed.
 */
function syncHasVideoColumn(files: VideoFile[]): void {
  // Batches keep each statement well under SQLite's bind-parameter cap.
  const BATCH_SIZE = 500;
  const distinctCodes = [...new Set(files.map((file) => file.code))];

  const applyFlags = rawDb.transaction(() => {
    rawDb.prepare(`UPDATE images SET has_video = 0 WHERE has_video = 1`).run();
    for (let start = 0; start < distinctCodes.length; start += BATCH_SIZE) {
      const batch = distinctCodes.slice(start, start + BATCH_SIZE);
      const marks = batch.map(() => "?").join(",");
      const setFlag = rawDb.prepare(
        `UPDATE images SET has_video = 1 WHERE upper(code) IN (${marks})`,
      );
      setFlag.run(...batch);
    }
  });

  try {
    applyFlags();
  } catch (err) {
    console.error("[video] failed to sync has_video column:", err);
  }
}

/**
 * Rewrite `images.has_subtitle` from the freshly collected subtitle
 * codes. Runs as one transaction: clear every set flag, then set it for
 * each code in the set, in batches. Errors are logged and swallowed.
 */
function syncHasSubtitleColumn(subtitleCodes: Set<string>): void {
  const BATCH_SIZE = 500;
  const allCodes = [...subtitleCodes];

  const applyFlags = rawDb.transaction(() => {
    rawDb.prepare(`UPDATE images SET has_subtitle = 0 WHERE has_subtitle = 1`).run();
    let offset = 0;
    while (offset < allCodes.length) {
      const batch = allCodes.slice(offset, offset + BATCH_SIZE);
      const marks = batch.map(() => "?").join(",");
      rawDb.prepare(
        `UPDATE images SET has_subtitle = 1 WHERE upper(code) IN (${marks})`,
      ).run(...batch);
      offset += BATCH_SIZE;
    }
  });

  try {
    applyFlags();
  } catch (err) {
    console.error("[video] failed to sync has_subtitle column:", err);
  }
}

/**
 * Row shape returned by the per-code `video_metadata` lookup below;
 * field names mirror the selected column names.
 */
interface VideoMetaRow {
  /** Absolute path of the file. */
  abs_path: string;
  /** Stored path relative to a video root. */
  rel_path: string;
  /** Code stored for the file. */
  code: string;
  /** File size in bytes. */
  size_bytes: number;
  /** File mtime (ms). */
  mtime_ms: number;
}

/**
 * Return every indexed video file for one code, ordered by relative
 * path (case-insensitive). Queries `video_metadata` directly, so the
 * answer reflects the most recent rescan.
 *
 * @param code Code to look up; normalized first, falling back to plain
 *             upper-casing when normalization yields nothing. Empty,
 *             null or undefined returns an empty list.
 */
export function findVideosForCode(code: string | null | undefined): VideoFile[] {
  if (!code) return [];

  const lookupCode = normalizeCode(code) ?? code.toUpperCase();
  const matches = rawDb.prepare(`
    SELECT abs_path, rel_path, code, size_bytes, mtime_ms
    FROM video_metadata
    WHERE upper(code) = ?
    ORDER BY rel_path COLLATE NOCASE
  `).all(lookupCode) as VideoMetaRow[];

  const found: VideoFile[] = [];
  for (const row of matches) {
    found.push({
      abs: row.abs_path,
      rel: row.rel_path,
      filename: path.basename(row.abs_path),
      code: row.code,
      size: row.size_bytes,
      mtime: row.mtime_ms,
    });
  }
  return found;
}

/**
 * Collect the distinct upper-cased codes present in `video_metadata`
 * into a Set for fast existence checks.
 */
export function getCodesWithVideos(): Set<string> {
  const found = new Set<string>();
  const rows = rawDb.prepare(`
    SELECT DISTINCT upper(code) AS code FROM video_metadata
  `).all() as Array<{ code: string }>;
  for (const { code } of rows) found.add(code);
  return found;
}

/**
 * Collect the distinct upper-cased codes of `images` rows flagged
 * `has_subtitle = 1` (the flag is written at rescan time) into a Set.
 */
export function getCodesWithSubtitles(): Set<string> {
  const found = new Set<string>();
  const rows = rawDb.prepare(`
    SELECT DISTINCT upper(code) AS code FROM images WHERE has_subtitle = 1 AND code IS NOT NULL
  `).all() as Array<{ code: string }>;
  for (const { code } of rows) found.add(code);
  return found;
}