621 lines
22 KiB
TypeScript
621 lines
22 KiB
TypeScript
"use server";
|
|
import path from "node:path";
|
|
import fs from "node:fs/promises";
|
|
import sharp from "sharp";
|
|
import { rawDb } from "@/lib/db/client";
|
|
import { sanitizeFilename, uniqueFilePath, letterBucket, canonicalThumbName } from "@/lib/filename";
|
|
import { extractCode } from "@/lib/jav/codeParser";
|
|
import { computeDHash, hammingDistance } from "@/lib/jav/phash";
|
|
import { clearAppSettingsCache } from "@/lib/db/appSettings";
|
|
import { safeJoin } from "@/lib/safePath";
|
|
import { revalidatePath } from "next/cache";
|
|
|
|
/** Build an absolute path for a location under the server process's working directory. */
const fromCwd = (...segments: string[]): string => path.join(process.cwd(), ...segments);

/** Directory holding the original library files; `images.rel_path` is compared relative to it. */
const LIBRARY_ROOT = fromCwd("library");
/** Directory holding generated thumbnails; `images.thumb_path` is compared relative to it. */
const THUMB_ROOT = fromCwd("data", "thumbs");
/** Directory scanned for actress portrait files. */
const PORTRAIT_ROOT = fromCwd("data", "portraits");
/** Directory scanned for tag-category cover files. */
const CATEGORY_COVER_ROOT = fromCwd("data", "category-covers");
/** Directory scanned for collection cover files. */
const COLLECTION_COVER_ROOT = fromCwd("data", "collection-covers");

/** Lower-cased file names that directory walks ignore (OS bookkeeping files). */
const SYSTEM_FILES = new Set<string>([".ds_store", "thumbs.db", "desktop.ini"]);
|
|
|
|
/**
 * Result of an orphan scan: for each storage root, the absolute paths of
 * files found on disk that no database row references, plus their total size.
 */
interface OrphanReport {
  /** Unreferenced files under the library root. */
  libraryFiles: string[];
  /** Unreferenced files under the thumbnail root. */
  thumbFiles: string[];
  /** Unreferenced files under the portrait root. */
  portraitFiles: string[];
  /** Unreferenced files under the category-cover root. */
  categoryCoverFiles: string[];
  /** Unreferenced files under the collection-cover root. */
  collectionCoverFiles: string[];
  /** Combined size in bytes of every orphan that could be stat'ed. */
  bytes: number;
}
|
|
|
|
/**
 * Recursively list every regular file below `dir`.
 *
 * Returns joined paths (`dir` + entry names). Entries whose lower-cased name
 * is in SYSTEM_FILES are omitted, entries that are neither directories nor
 * regular files are ignored, and a directory that cannot be read contributes
 * an empty list rather than an error.
 */
async function walk(dir: string): Promise<string[]> {
  let entries: import("node:fs").Dirent[];
  try {
    entries = await fs.readdir(dir, { withFileTypes: true });
  } catch {
    // Missing or unreadable directory: treat as empty.
    return [];
  }

  const found: string[] = [];
  const pending: Promise<void>[] = [];
  for (const entry of entries) {
    const child = path.join(dir, entry.name);
    if (entry.isDirectory()) {
      // Descend concurrently; results are appended whenever each subtree finishes.
      pending.push(
        walk(child).then((nested) => {
          found.push(...nested);
        }),
      );
      continue;
    }
    if (!entry.isFile()) continue;
    if (SYSTEM_FILES.has(entry.name.toLowerCase())) continue;
    found.push(child);
  }
  await Promise.all(pending);
  return found;
}
|
|
|
|
/** Run a query that yields one column aliased `p` and return its values as normalized paths. */
function loadKnownPaths(sql: string): Set<string> {
  const rows = rawDb.prepare(sql).all() as Array<{ p: string }>;
  return new Set(rows.map((r) => path.normalize(r.p)));
}

/** Keep the files under `root` whose normalized root-relative path is not in `known`. */
function keepUnreferenced(root: string, files: string[], known: Set<string>): string[] {
  return files.filter((abs) => !known.has(path.normalize(path.relative(root, abs))));
}

/** Sum file sizes, stat'ing at most `batchSize` files at once; files that cannot be stat'ed count as 0. */
async function totalSize(files: string[], batchSize: number): Promise<number> {
  let bytes = 0;
  for (let i = 0; i < files.length; i += batchSize) {
    const sizes = await Promise.all(
      files.slice(i, i + batchSize).map(async (f) => {
        try {
          return (await fs.stat(f)).size;
        } catch {
          return 0;
        }
      }),
    );
    for (const size of sizes) bytes += size;
  }
  return bytes;
}

/**
 * Compare the files on disk under the five storage roots with the paths
 * recorded in the database and report every file no row points at.
 *
 * The image queries carry no `deleted_at` filter, so files belonging to
 * soft-deleted rows still count as referenced. Paths on both sides are passed
 * through `path.normalize` before comparison.
 *
 * @returns Absolute orphan paths per root plus their combined size in bytes.
 */
async function findOrphans(): Promise<OrphanReport> {
  const knownLibrary = loadKnownPaths(`SELECT rel_path AS p FROM images`);
  const knownThumbs = loadKnownPaths(`SELECT thumb_path AS p FROM images`);
  const knownPortraits = loadKnownPaths(`
    SELECT portrait_path AS p FROM actresses WHERE portrait_path IS NOT NULL
    UNION ALL SELECT portrait2_path FROM actresses WHERE portrait2_path IS NOT NULL
    UNION ALL SELECT portrait3_path FROM actresses WHERE portrait3_path IS NOT NULL
    UNION ALL SELECT portrait4_path FROM actresses WHERE portrait4_path IS NOT NULL
    UNION ALL SELECT portraith_path FROM actresses WHERE portraith_path IS NOT NULL
  `);
  const knownCategoryCovers = loadKnownPaths(`
    SELECT cover_portrait_path AS p FROM tag_categories WHERE cover_portrait_path IS NOT NULL
    UNION ALL SELECT cover_landscape_path FROM tag_categories WHERE cover_landscape_path IS NOT NULL
  `);
  const knownCollectionCovers = loadKnownPaths(`
    SELECT cover_portrait_path AS p FROM collections WHERE cover_portrait_path IS NOT NULL
    UNION ALL SELECT cover_landscape_path FROM collections WHERE cover_landscape_path IS NOT NULL
  `);

  const [libFiles, thumbFiles, portraitFiles, categoryCoverFiles, collectionCoverFiles] = await Promise.all([
    walk(LIBRARY_ROOT),
    walk(THUMB_ROOT),
    walk(PORTRAIT_ROOT),
    walk(CATEGORY_COVER_ROOT),
    walk(COLLECTION_COVER_ROOT),
  ]);

  const libraryOrphans = keepUnreferenced(LIBRARY_ROOT, libFiles, knownLibrary);
  const thumbOrphans = keepUnreferenced(THUMB_ROOT, thumbFiles, knownThumbs);
  const portraitOrphans = keepUnreferenced(PORTRAIT_ROOT, portraitFiles, knownPortraits);
  const categoryCoverOrphans = keepUnreferenced(CATEGORY_COVER_ROOT, categoryCoverFiles, knownCategoryCovers);
  const collectionCoverOrphans = keepUnreferenced(COLLECTION_COVER_ROOT, collectionCoverFiles, knownCollectionCovers);

  // Stat in bounded batches rather than one Promise.all over every orphan,
  // matching the concurrency cap the purge action applies to its fs calls.
  const STAT_CONCURRENCY = 32;
  const bytes = await totalSize(
    [
      ...libraryOrphans, ...thumbOrphans, ...portraitOrphans,
      ...categoryCoverOrphans, ...collectionCoverOrphans,
    ],
    STAT_CONCURRENCY,
  );

  return {
    libraryFiles: libraryOrphans,
    thumbFiles: thumbOrphans,
    portraitFiles: portraitOrphans,
    categoryCoverFiles: categoryCoverOrphans,
    collectionCoverFiles: collectionCoverOrphans,
    bytes,
  };
}
|
|
|
|
/**
 * Dry-run of the orphan purge: reports how many unreferenced files exist
 * across all storage roots and how many bytes they occupy, without deleting.
 */
export async function previewOrphanFiles(): Promise<{ count: number; bytes: number }> {
  const report = await findOrphans();
  const buckets = [
    report.libraryFiles,
    report.thumbFiles,
    report.portraitFiles,
    report.categoryCoverFiles,
    report.collectionCoverFiles,
  ];
  const count = buckets.reduce((sum, files) => sum + files.length, 0);
  return { count, bytes: report.bytes };
}
|
|
|
|
/**
 * Delete every orphan reported by the scan, prune directories left empty
 * under each storage root, and invalidate the listing pages.
 *
 * @returns `deleted` is the number of orphan paths submitted for removal;
 *          `bytes` is the size total computed by the scan.
 */
export async function purgeOrphanFiles(): Promise<{ deleted: number; bytes: number }> {
  const {
    libraryFiles,
    thumbFiles,
    portraitFiles,
    categoryCoverFiles,
    collectionCoverFiles,
    bytes,
  } = await findOrphans();
  const doomed = libraryFiles.concat(thumbFiles, portraitFiles, categoryCoverFiles, collectionCoverFiles);

  // Remove in fixed-size batches: launching thousands of fs.rm calls at once
  // can exhaust file descriptors (EMFILE) on Windows / low-ulimit hosts.
  const BATCH_SIZE = 32;
  let offset = 0;
  while (offset < doomed.length) {
    const batch = doomed.slice(offset, offset + BATCH_SIZE);
    await Promise.all(batch.map((file) => fs.rm(file, { force: true })));
    offset += BATCH_SIZE;
  }

  // Every root may now contain empty subdirectories; sweep them all.
  const roots = [LIBRARY_ROOT, THUMB_ROOT, PORTRAIT_ROOT, CATEGORY_COVER_ROOT, COLLECTION_COVER_ROOT];
  await Promise.all(roots.map((root) => cleanEmptyDirs(root)));

  // Pages that display cover/portrait/thumb counts must refetch.
  for (const route of ["/", "/category", "/collection", "/actress"]) {
    revalidatePath(route);
  }
  return { deleted: doomed.length, bytes };
}
|
|
|
|
/** Dry-run summary for the reorganize action. */
interface ReorganizePreview {
  /** Number of image rows examined. */
  total: number;
  /** Rows whose current directory differs from their planned bucket directory. */
  toMove: number;
}
|
|
|
|
/** Columns of the `images` table that the reorganize logic reads. */
interface ImageRow {
  /** Row id. */
  id: number;
  /** Stored filename; used (after sanitizing) as the name of the moved file. */
  filename: string;
  /** Location of the file relative to the library root. */
  rel_path: string;
  /** Code used to pick the letter bucket; may be absent. */
  code: string | null;
  /** Id of the row this image is attached to, or null for a top-level image. */
  parent_image_id: number | null;
}
|
|
|
|
/**
 * Work out which letter-bucket directory (relative to the library root) a row
 * belongs in. A row with a parent is bucketed by the parent's code, falling
 * back to null when the parent is unknown, so attachments sit next to the
 * image they belong to; a top-level row is bucketed by its own code.
 */
function plannedDirRel(row: ImageRow, parentCodeById: Map<number, string | null>): string {
  const bucketCode =
    row.parent_image_id == null
      ? row.code
      : (parentCodeById.get(row.parent_image_id) ?? null);
  return letterBucket(bucketCode).dirRel;
}
|
|
|
|
/**
 * Read every row of `images` (no filtering) together with an id → code lookup
 * covering those same rows, used to resolve the code of a row's parent.
 */
function loadAllImages(): { rows: ImageRow[]; parentCodeById: Map<number, string | null> } {
  const select = rawDb.prepare(`SELECT id, filename, rel_path, code, parent_image_id FROM images`);
  const rows = select.all() as ImageRow[];
  const parentCodeById = new Map<number, string | null>(
    rows.map((row): [number, string | null] => [row.id, row.code]),
  );
  return { rows, parentCodeById };
}
|
|
|
|
/**
 * Dry-run of the reorganize action: counts rows whose current directory
 * (taken from `rel_path`, with backslashes treated as separators) is not the
 * planned letter-bucket directory.
 */
export async function previewReorganize(): Promise<ReorganizePreview> {
  const { rows, parentCodeById } = loadAllImages();
  const misplaced = rows.filter((row) => {
    const wantedDir = plannedDirRel(row, parentCodeById);
    const actualDir = path.posix.dirname(row.rel_path.replace(/\\/g, "/"));
    return actualDir !== wantedDir;
  });
  return { total: rows.length, toMove: misplaced.length };
}
|
|
|
|
/**
 * Move every library file into its planned letter-bucket directory and
 * record the new location in `images.rel_path`.
 *
 * Per row:
 *  - already in the planned directory → counted as `skipped`;
 *  - `rel_path` rejected by `safeJoin`, or source file not accessible →
 *    counted as `errors`;
 *  - otherwise the file is renamed to a unique name in the target directory
 *    and the row is updated. If the DB update fails the rename is undone, so
 *    the row and the file never end up disagreeing.
 *
 * Afterwards empty directories under the library root are removed and "/" is
 * revalidated.
 *
 * @returns Counts of rows moved, skipped and failed.
 */
export async function reorganizeFiles(): Promise<{ moved: number; skipped: number; errors: number }> {
  const { rows, parentCodeById } = loadAllImages();
  // Prepared once; re-preparing per row is wasted work.
  const updateRelPath = rawDb.prepare(`UPDATE images SET rel_path = ? WHERE id = ?`);

  let moved = 0, skipped = 0, errors = 0;
  for (const r of rows) {
    const target = plannedDirRel(r, parentCodeById);
    const currentDir = path.posix.dirname(r.rel_path.replace(/\\/g, "/"));
    if (currentDir === target) { skipped++; continue; }

    // Resolve stored paths through safeJoin, as the thumbnail and phash
    // actions in this file do, instead of a raw path.join.
    const oldAbs = safeJoin(LIBRARY_ROOT, r.rel_path);
    if (!oldAbs) { errors++; continue; }
    try {
      await fs.access(oldAbs);
    } catch {
      errors++;
      continue;
    }

    const { base, ext } = sanitizeFilename(r.filename || `image${path.extname(r.rel_path)}`);
    const dirAbs = path.join(LIBRARY_ROOT, target);
    try {
      await fs.mkdir(dirAbs, { recursive: true });
      const newAbs = await uniqueFilePath(dirAbs, base, ext);
      await fs.rename(oldAbs, newAbs);
      const newRel = path.posix.join(target, path.basename(newAbs));
      try {
        updateRelPath.run(newRel, r.id);
      } catch (dbErr) {
        // The file moved but the row still points at the old path. Put it
        // back so it does not look like an orphan to a later purge.
        await fs.rename(newAbs, oldAbs).catch(() => {});
        throw dbErr;
      }
      moved++;
    } catch {
      errors++;
    }
  }

  await cleanEmptyDirs(LIBRARY_ROOT);
  revalidatePath("/");
  return { moved, skipped, errors };
}
|
|
|
|
/**
 * Drop the cached app settings and invalidate each of the listed top-level
 * routes so they are re-rendered on next request.
 */
export async function clearCache(): Promise<{ ok: true }> {
  clearAppSettingsCache();
  const routes = ["/", "/collection", "/tag", "/category", "/actress", "/studios", "/series", "/genres", "/queue"];
  routes.forEach((route) => {
    revalidatePath(route);
  });
  return { ok: true };
}
|
|
|
|
/** One row returned by the undersized-cover scan. */
export interface UndersizedCover {
  /** `images.id` of the cover. */
  id: number;
  /** Stored code, if any. */
  code: string | null;
  /** Stored filename. */
  filename: string;
  /** Stored pixel width. */
  width: number;
  /** Stored pixel height. */
  height: number;
  /** Value of the `bytes` column. */
  bytes: number;
  /** Value of the `thumb_path` column. */
  thumbPath: string;
}
|
|
|
|
/**
 * List top-level, non-deleted covers whose stored width or height falls below
 * the given floor — a sign that a thumbnail, web preview or other non-cover
 * image was imported by mistake.
 *
 * The standard JAV cover is about 800x538; the default floors (750 wide,
 * 500 high) leave slack for scans and rips that are off by a few pixels while
 * still catching obvious outliers such as a 147x200 image.
 *
 * @param opts.minWidth  Width floor in pixels (default 750).
 * @param opts.minHeight Height floor in pixels (default 500).
 * @returns Matching covers, smallest pixel area first, ties broken by id.
 */
export async function scanUndersizedCovers(opts?: {
  minWidth?: number;
  minHeight?: number;
}): Promise<UndersizedCover[]> {
  const widthFloor = opts?.minWidth ?? 750;
  const heightFloor = opts?.minHeight ?? 500;
  const query = rawDb.prepare(`
    SELECT id, code, filename, width, height, bytes, thumb_path AS thumbPath
    FROM images
    WHERE parent_image_id IS NULL
    AND deleted_at IS NULL
    AND (width < ? OR height < ?)
    ORDER BY (width * height) ASC, id ASC
  `);
  const undersized = query.all(widthFloor, heightFloor) as UndersizedCover[];
  return undersized;
}
|
|
|
|
/** Dry-run summary for thumbnail regeneration. */
interface RegenThumbsPreview {
  /** Number of non-deleted image rows examined. */
  total: number;
  /** Rows whose canonical thumbnail file is not present on disk. */
  missing: number;
  /** Rows whose stored `thumb_path` differs from the canonical name. */
  staleNames: number;
}
|
|
|
|
/**
 * Compute the canonical thumbnail filename for a row. A top-level image uses
 * its own code; an attached image looks up its parent's code in the database
 * (null if the parent row is not found) so it inherits the parent's prefix.
 */
function plannedThumbName(row: { sha256: string; code: string | null; parent_image_id: number | null }): string {
  if (row.parent_image_id == null) {
    return canonicalThumbName(row.code, row.sha256);
  }
  const parentRow = rawDb
    .prepare(`SELECT code FROM images WHERE id = ?`)
    .get(row.parent_image_id) as { code: string | null } | undefined;
  const inheritedCode = parentRow?.code ?? null;
  return canonicalThumbName(inheritedCode, row.sha256);
}
|
|
|
|
/**
 * Dry-run of thumbnail regeneration over all non-deleted rows: counts rows
 * whose canonical thumbnail file is absent from disk (`missing`) and rows
 * whose stored `thumb_path` is not the canonical name (`staleNames`).
 */
export async function previewRegenThumbnails(): Promise<RegenThumbsPreview> {
  type ThumbRow = { thumb_path: string; sha256: string; code: string | null; parent_image_id: number | null };
  const rows = rawDb.prepare(`
    SELECT thumb_path, sha256, code, parent_image_id FROM images WHERE deleted_at IS NULL
  `).all() as ThumbRow[];

  const preview: RegenThumbsPreview = { total: rows.length, missing: 0, staleNames: 0 };
  // Deliberately one file check at a time: firing them all at once can hit
  // EMFILE on very large libraries, and sequential is fast enough here.
  for (const row of rows) {
    const canonical = plannedThumbName(row);
    if (canonical !== row.thumb_path) preview.staleNames += 1;
    const canonicalAbs = path.join(THUMB_ROOT, canonical);
    const present = await fs.access(canonicalAbs).then(
      () => true,
      () => false,
    );
    if (!present) preview.missing += 1;
  }
  return preview;
}
|
|
|
|
/**
 * Rebuild thumbnails for every non-deleted row. Per row:
 *   1. Canonical file already on disk → skip, syncing `thumb_path` if it
 *      still holds a legacy name (not applied when `force` is set).
 *   2. A file exists at the stored legacy `thumb_path` (name differs from
 *      canonical) → rename it to the canonical name and update `thumb_path`.
 *      No re-encode; this is the migration path for libraries that predate
 *      the code-prefix naming (not applied when `force` is set).
 *   3. Otherwise → encode a fresh thumbnail from the library original.
 *
 * With `force`, every row is re-encoded. The legacy file, if any, is removed
 * only AFTER the new thumbnail has been written and the row updated, so a
 * failed encode never leaves the row without its previous thumbnail.
 *
 * @param opts.force Re-encode even when a usable thumbnail already exists.
 * @returns Counts of rows re-encoded, renamed, skipped and failed.
 */
export async function regenerateThumbnails(opts?: { force?: boolean }): Promise<{ regenerated: number; renamed: number; skipped: number; errors: number }> {
  const force = opts?.force ?? false;
  const rows = rawDb.prepare(`
    SELECT id, rel_path, thumb_path, sha256, code, parent_image_id FROM images WHERE deleted_at IS NULL
  `).all() as Array<{ id: number; rel_path: string; thumb_path: string; sha256: string; code: string | null; parent_image_id: number | null }>;
  // Prepared once and reused by all three paths.
  const setThumbPath = rawDb.prepare(`UPDATE images SET thumb_path = ? WHERE id = ?`);

  await fs.mkdir(THUMB_ROOT, { recursive: true });

  let regenerated = 0, renamed = 0, skipped = 0, errors = 0;
  for (const r of rows) {
    const target = plannedThumbName(r);
    const targetAbs = path.join(THUMB_ROOT, target);
    const nameIsStale = r.thumb_path !== target;

    if (!force) {
      try {
        await fs.access(targetAbs);
        // Canonical file exists. If the DB still has the legacy name,
        // sync the column so future operations don't drift.
        if (nameIsStale) setThumbPath.run(target, r.id);
        skipped++;
        continue;
      } catch { /* missing — fall through */ }
    }

    // Legacy thumbnail to delete once a forced re-encode has succeeded.
    let legacyAbs: string | undefined;
    if (nameIsStale) {
      const oldAbs = safeJoin(THUMB_ROOT, r.thumb_path);
      if (oldAbs) {
        if (force) {
          legacyAbs = oldAbs;
        } else {
          // Rename instead of re-encoding: faster, lossless, and keeps
          // whatever the file already was.
          try {
            await fs.access(oldAbs);
            await fs.rename(oldAbs, targetAbs);
            setThumbPath.run(target, r.id);
            renamed++;
            continue;
          } catch { /* legacy file missing — fall through to encode */ }
        }
      }
    }

    const libAbs = safeJoin(LIBRARY_ROOT, r.rel_path);
    if (!libAbs) {
      errors++;
      continue;
    }

    try {
      // Hand sharp the file path rather than a buffer: the library can
      // contain multi-GB videos misclassified as images, and reading those
      // into memory would OOM the server. Mirrors lib/ingest/ingest.ts's
      // resize pipeline.
      await sharp(libAbs, { failOn: "none" })
        .rotate()
        .resize({ width: 768, height: 768, fit: "inside", withoutEnlargement: true })
        .webp({ quality: 82 })
        .toFile(targetAbs);
      if (nameIsStale) setThumbPath.run(target, r.id);
      // Drop the legacy file now that its replacement is in place. Skip when
      // the two names differ only by case: on a case-insensitive filesystem
      // they are the same file and removing it would delete the new thumbnail.
      if (legacyAbs && path.resolve(legacyAbs).toLowerCase() !== path.resolve(targetAbs).toLowerCase()) {
        await fs.rm(legacyAbs, { force: true }).catch(() => {});
      }
      regenerated++;
    } catch {
      errors++;
    }
  }

  revalidatePath("/");
  return { regenerated, renamed, skipped, errors };
}
|
|
|
|
/**
 * Depth-first removal of empty directories beneath `root`. Each subdirectory
 * is cleaned first and then deleted if nothing is left inside it. `root`
 * itself is never removed. Unreadable directories and failed removals are
 * ignored (best effort).
 */
async function cleanEmptyDirs(root: string): Promise<void> {
  const children = await fs.readdir(root, { withFileTypes: true }).catch(() => null);
  if (children === null) return;

  const subdirs = children
    .filter((child) => child.isDirectory())
    .map((child) => path.join(root, child.name));

  for (const sub of subdirs) {
    await cleanEmptyDirs(sub);
    try {
      const leftovers = await fs.readdir(sub);
      if (leftovers.length > 0) continue;
      await fs.rmdir(sub);
    } catch {
      // Best effort: leave the directory in place if it can't be read or removed.
    }
  }
}
|
|
|
|
/** Dry-run summary for the code re-parse action. */
export interface ReparseCodesPreview {
  /** Number of rows examined. */
  total: number;
  /** Rows that have no stored code but for which extractCode now finds one — safe to fill. */
  missing: number;
  /**
   * Rows whose stored code differs from what extractCode returns. Overwriting
   * these would discard any manual edit, so it is gated behind force=true.
   */
  changed: number;
  /** At most 20 of the changed rows, for display in the preview UI. */
  sampleChanges: Array<{ id: number; filename: string; oldCode: string; newCode: string }>;
}
|
|
|
|
/**
 * Dry-run of the code re-parse. Runs extractCode on the stored filename of
 * every top-level, non-deleted cover and tallies what would change: rows
 * lacking a code that would gain one, and rows whose stored code disagrees
 * with the extracted one (with up to 20 examples). Rows for which extractCode
 * returns nothing are not counted in either tally.
 */
export async function previewReparseCodes(): Promise<ReparseCodesPreview> {
  const covers = rawDb.prepare(`
    SELECT id, filename, code FROM images
    WHERE deleted_at IS NULL AND parent_image_id IS NULL
  `).all() as Array<{ id: number; filename: string; code: string | null }>;

  const SAMPLE_CAP = 20;
  const preview: ReparseCodesPreview = { total: covers.length, missing: 0, changed: 0, sampleChanges: [] };
  for (const cover of covers) {
    const parsed = extractCode(cover.filename);
    if (!parsed) continue;
    if (cover.code == null) {
      preview.missing += 1;
      continue;
    }
    if (cover.code === parsed) continue;
    preview.changed += 1;
    if (preview.sampleChanges.length < SAMPLE_CAP) {
      preview.sampleChanges.push({ id: cover.id, filename: cover.filename, oldCode: cover.code, newCode: parsed });
    }
  }
  return preview;
}
|
|
|
|
/**
 * Re-run extractCode over top-level, non-deleted covers and write the result
 * back, all inside one transaction.
 *
 * Without `force` only rows with no stored code are filled. With
 * `force=true`, rows whose stored code disagrees with extractCode are
 * overwritten as well.
 *
 * Only the database is touched. Files stay in their old letter buckets until
 * Reorganize is run, and the code prefix in `<CODE>-<sha>.webp` thumbnail
 * names stays stale until Regenerate Thumbnails is run.
 *
 * @returns `filled` (codes added), `updated` (codes overwritten), `skipped`
 *          (rows left untouched).
 */
export async function reparseCodes(opts?: { force?: boolean }): Promise<{ filled: number; updated: number; skipped: number }> {
  const overwrite = opts?.force ?? false;
  const covers = rawDb.prepare(`
    SELECT id, filename, code FROM images
    WHERE deleted_at IS NULL AND parent_image_id IS NULL
  `).all() as Array<{ id: number; filename: string; code: string | null }>;

  const tally = { filled: 0, updated: 0, skipped: 0 };
  const applyAll = rawDb.transaction(() => {
    const setCode = rawDb.prepare(`UPDATE images SET code = ? WHERE id = ?`);
    for (const cover of covers) {
      const parsed = extractCode(cover.filename);
      // Nothing extracted, or already in agreement: leave the row alone.
      if (!parsed || cover.code === parsed) {
        tally.skipped += 1;
        continue;
      }
      if (cover.code == null) {
        setCode.run(parsed, cover.id);
        tally.filled += 1;
        continue;
      }
      // Stored code disagrees; only overwrite when explicitly forced.
      if (!overwrite) {
        tally.skipped += 1;
        continue;
      }
      setCode.run(parsed, cover.id);
      tally.updated += 1;
    }
  });
  applyAll();
  revalidatePath("/");
  return { filled: tally.filled, updated: tally.updated, skipped: tally.skipped };
}
|
|
|
|
/** Display fields for one cover in a near-duplicate pair. */
type NearDupeCover = {
  id: number;
  code: string | null;
  filename: string;
  thumbPath: string;
  width: number;
  height: number;
  bytes: number;
};

/** Two covers whose perceptual hashes are close, with their Hamming distance. */
export interface NearDupePair {
  /** First cover of the pair. */
  a: NearDupeCover;
  /** Second cover of the pair. */
  b: NearDupeCover;
  /** Hamming distance between the two covers' hashes. */
  distance: number;
}
|
|
|
|
/** Hash-coverage statistics shown before running near-duplicate detection. */
export interface NearDupesPreview {
  /** Number of non-deleted image rows. */
  total: number;
  /** Rows that already have a `phash`. */
  hashed: number;
  /** Rows still lacking a `phash` (`total - hashed`). */
  unhashed: number;
}
|
|
|
|
/**
 * Quick stats: how many non-deleted rows already have a phash and how many
 * still need backfilling.
 *
 * @returns `total`, `hashed` and `unhashed` counts; all zero for an empty library.
 */
export async function previewNearDupes(): Promise<NearDupesPreview> {
  const row = rawDb.prepare(`
    SELECT
      COUNT(*) AS total,
      SUM(CASE WHEN phash IS NOT NULL THEN 1 ELSE 0 END) AS hashed
    FROM images WHERE deleted_at IS NULL
  `).get() as { total: number; hashed: number | null };
  // SUM() yields NULL, not 0, when no rows match, so an empty library would
  // otherwise report `hashed: null`.
  const hashed = row.hashed ?? 0;
  return {
    total: row.total,
    hashed,
    unhashed: row.total - hashed,
  };
}
|
|
|
|
/** True when `err` is a filesystem error reporting that the path does not exist. */
function isMissingFileError(err: unknown): boolean {
  return typeof err === "object" && err !== null && (err as { code?: unknown }).code === "ENOENT";
}

/**
 * Backfill `phash` for every non-deleted row that doesn't have one yet.
 * Reads the library file, computes its dHash and writes it to the DB.
 *
 * @returns `hashed` — rows that received a hash; `skipped` — rows whose file
 *          is missing on disk; `errors` — rows whose path was rejected by
 *          `safeJoin`, whose file could not be read for another reason, or
 *          whose hashing/DB write failed.
 */
export async function backfillPhashes(): Promise<{ hashed: number; skipped: number; errors: number }> {
  const rows = rawDb.prepare(`
    SELECT id, rel_path FROM images
    WHERE deleted_at IS NULL AND phash IS NULL
  `).all() as Array<{ id: number; rel_path: string }>;

  let hashed = 0, skipped = 0, errors = 0;
  const update = rawDb.prepare(`UPDATE images SET phash = ? WHERE id = ?`);
  for (const r of rows) {
    const abs = safeJoin(LIBRARY_ROOT, r.rel_path);
    if (!abs) { errors++; continue; }

    let buf: Buffer;
    try {
      buf = await fs.readFile(abs);
    } catch (err: unknown) {
      // A file that is simply gone is a skip, not a failure; previously
      // `skipped` was declared and returned but never incremented.
      if (isMissingFileError(err)) skipped++;
      else errors++;
      continue;
    }

    try {
      const hash = await computeDHash(buf);
      update.run(hash, r.id);
      hashed++;
    } catch {
      errors++;
    }
  }
  return { hashed, skipped, errors };
}
|
|
|
|
/**
 * Find pairs of covers whose dHashes are within `threshold` Hamming
 * distance. Brute force O(n²); fine for personal-library scale (5k
 * covers ≈ 12.5M comparisons).
 *
 * Only top-level, non-deleted rows with a phash are compared, so
 * attached-image pairs (intentionally similar to their parent) never appear.
 * Pairs with identical SHA-256 are skipped; those are caught by upload dedup.
 *
 * Every pair is examined and the `limit` TIGHTEST matches are returned.
 * Stopping the scan once `limit` matches were found would return whichever
 * pairs come first in id order and could drop closer matches.
 *
 * @param opts.threshold Maximum Hamming distance, out of 64 bits (default 10,
 *                       a strong "same image, different encode" signal).
 * @param opts.limit     Maximum number of pairs to return (default 200);
 *                       a non-positive limit yields an empty list.
 * @returns Pairs sorted by distance, then by the ids of `a` and `b`.
 */
export async function findNearDuplicates(opts?: { threshold?: number; limit?: number }): Promise<NearDupePair[]> {
  const threshold = opts?.threshold ?? 10;
  const limit = opts?.limit ?? 200;
  if (!(limit > 0)) return [];
  // A fractional limit rounds up, matching how many pairs a `length < limit` cut-off admits.
  const cap = Math.ceil(limit);

  const rows = rawDb.prepare(`
    SELECT id, code, filename, rel_path, thumb_path AS thumbPath, sha256, phash, width, height, bytes
    FROM images
    WHERE deleted_at IS NULL AND parent_image_id IS NULL AND phash IS NOT NULL
    ORDER BY id ASC
  `).all() as Array<{
    id: number; code: string | null; filename: string; rel_path: string; thumbPath: string;
    sha256: string; phash: string; width: number; height: number; bytes: number;
  }>;

  // Tightest matches first, then lowest id pair for stability.
  const byTightness = (x: NearDupePair, y: NearDupePair): number =>
    x.distance - y.distance || x.a.id - y.a.id || x.b.id - y.b.id;

  let pairs: NearDupePair[] = [];
  for (let i = 0; i < rows.length; i++) {
    const a = rows[i];
    for (let j = i + 1; j < rows.length; j++) {
      const b = rows[j];
      if (a.sha256 === b.sha256) continue; // SHA-identical pairs handled elsewhere
      const d = hammingDistance(a.phash, b.phash);
      if (d > threshold) continue;
      pairs.push({
        a: { id: a.id, code: a.code, filename: a.filename, thumbPath: a.thumbPath, width: a.width, height: a.height, bytes: a.bytes },
        b: { id: b.id, code: b.code, filename: b.filename, thumbPath: b.thumbPath, width: b.width, height: b.height, bytes: b.bytes },
        distance: d,
      });
      // Keep memory bounded: once the buffer doubles the cap, retain only the
      // best `cap`. A dropped pair is worse than `cap` others, so it could
      // never be part of the final result.
      if (pairs.length >= cap * 2) {
        pairs.sort(byTightness);
        pairs = pairs.slice(0, cap);
      }
    }
  }
  pairs.sort(byTightness);
  return pairs.slice(0, cap);
}
|