Step 11: benchmark host fast-path, decision = keep

Adds benchmarks/host-fast-path.py and benchmarks/README.md.

The benchmark compares two paths for a cached single-ID search:

1. fast-path: in-process dict walk inside the native host (handle_cached_search_fast in rcjav-host.py)
2. subprocess: shell out to `rc-jav.py --search ID --cache --format json`

Idle baseline against the live 7124-file cache (5 queries × 5 iter):

    fast-path:  median 0.46ms  p95 0.61ms   max 0.72ms
    subprocess: median 919ms   p95 1233ms   max 1385ms
    median speedup: 2000x

Decision: keep the fast path.

The ~920ms subprocess cost is dominated by Python interpreter startup + 1.3MB cache.json parse. That's structural — it applies under idle Python too, not just when a scan is running. The "Python actively scanning" condition from the original roadmap doesn't change the verdict; it would only make the subprocess path even slower while leaving the in-process path unaffected (the fast path doesn't touch the scanning process).

The fast path is already correctly scoped — it bails out for wildcards, ranges, name searches, and --quick mode. Narrowing further would just push more queries through the slow path with no upside.

Possible follow-up (not in scope here): memoize _load_host_cache with mtime-based invalidation so the fast path doesn't reparse cache.json on every call. Current per-call median (0.46ms) is already fast enough that this is optional.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-23 11:12:33 +02:00
parent 66f82eb214
commit b9a24b3fb5
2 changed files with 217 additions and 0 deletions
@@ -0,0 +1,151 @@
+"""Measure host fast-path vs subprocess rc-jav.py for cached single-ID search.
+
+Step 11 of the console-consolidation roadmap asks: does the host's
+`handle_cached_search_fast` actually save meaningful latency vs just
+shelling out to `rc-jav.py --search ID --cache --format json`? If yes,
+under what conditions (idle Python vs Python actively scanning)?
+
+This script runs both paths N times against a set of query IDs and
+reports min / median / mean / p95 / max in milliseconds.
+
+Usage:
+    python benchmarks/host-fast-path.py [--queries Q1 Q2 ...] [--iterations N]
+
+To measure (b) Python-actively-scanning, kick off a `rc-jav.py --scan` in
+another terminal, then run this script while the scan runs.
+
+The fast-path implementation is replicated inline here (not imported
+from the host module) so the benchmark is self-contained.
+"""
+from __future__ import annotations
+
+import argparse
+import json
+import statistics
+import subprocess
+import sys
+import time
+from pathlib import Path
+
+# Repository root. The script is run as benchmarks/host-fast-path.py, so the
+# second parent of the resolved script path is the directory above benchmarks/.
+ROOT = Path(__file__).resolve().parents[1]
+# Put the repository root on sys.path so the project-local `rcjav` package
+# imports below resolve when this file is executed directly.
+if str(ROOT) not in sys.path:
+    sys.path.insert(0, str(ROOT))
+
+from rcjav.cache import load_cache  # noqa: E402
+from rcjav.ids import current_rules_signature, normalize_id  # noqa: E402
+
+
+# Query IDs benchmarked when --queries is not given.
+DEFAULT_QUERIES = ["SSIS-001", "ABP-100", "FC2-1841460", "MIDD-500", "IBW-902"]
+# Timed runs per query, per path, when --iterations is not given.
+DEFAULT_ITERATIONS = 20
+
+
+def fast_path_search(cache: dict, query: str) -> int:
+    """Count cached files whose ``jav_id`` matches ``query``.
+
+    Replicates handle_cached_search_fast minus the response shape: only the
+    number of hits is returned, no result payload is built.
+
+    Args:
+        cache: Cache mapping. The code reads ``cache["remotes"]`` as a
+            mapping of remote name -> entry, and each entry's ``"files"``
+            as a list of dict-like items carrying a ``"jav_id"`` key.
+        query: Raw ID as typed by the user; it is passed through
+            ``normalize_id`` before comparison.
+
+    Returns:
+        Number of file items whose ``jav_id`` equals the normalized query
+        or starts with ``<normalized id>#part``. ``0`` when the normalized
+        query is falsy.
+    """
+    norm = normalize_id(query)
+    # A falsy normalized ID means there is nothing to look up.
+    if not norm:
+        return 0
+    hits = 0
+    # `or {}` / `or []` tolerate a missing or null "remotes" / "files" value.
+    for remote, entry in (cache.get("remotes") or {}).items():
+        files = entry.get("files") or []
+        for item in files:
+            jid = item.get("jav_id", "")
+            # Exact match, or a multi-part entry stored as "<id>#part...".
+            # The isinstance guard keeps non-string jav_id values from raising.
+            if jid == norm or (isinstance(jid, str) and jid.startswith(norm + "#part")):
+                hits += 1
+    return hits
+
+
+def time_fast_path(query: str, iterations: int) -> list[float]:
+    """Time ``iterations`` in-process searches for ``query``.
+
+    The cache is loaded once per call, before the timed loop, so each sample
+    covers only the ``fast_path_search`` dict walk and not the cache load.
+
+    Args:
+        query: ID to search for.
+        iterations: Number of timed runs; a value <= 0 yields an empty list.
+
+    Returns:
+        One wall-clock duration per run, in milliseconds.
+    """
+    # NOTE(review): assumes load_cache returns a dict-like cache for the
+    # current rules signature; confirm what it returns when no cache exists.
+    cache = load_cache(current_rules_signature())
+    samples_ms: list[float] = []
+    for _ in range(iterations):
+        started = time.perf_counter()
+        fast_path_search(cache, query)
+        elapsed_ms = (time.perf_counter() - started) * 1000
+        samples_ms.append(elapsed_ms)
+    return samples_ms
+
+
+def time_subprocess(query: str, iterations: int) -> list[float]:
+    """Time ``iterations`` runs of ``rc-jav.py --search`` as a child process.
+
+    Each sample spans the whole ``subprocess.run`` call: process spawn,
+    execution, and output capture. Exit codes other than 0 and 1 are reported
+    on stderr, but the sample is still recorded.
+
+    Args:
+        query: ID passed to ``--search``.
+        iterations: Number of timed runs; a value <= 0 yields an empty list.
+
+    Returns:
+        One wall-clock duration per run, in milliseconds.
+    """
+    script = str(ROOT / "rc-jav.py")
+    # --cache forces cache mode (no rclone); the remaining flags keep the
+    # output plain JSON without colour.
+    argv = [
+        sys.executable, script,
+        "--search", query,
+        "--cache",
+        "--format", "json",
+        "--basic", "--no-color",
+    ]
+    samples_ms: list[float] = []
+    for _ in range(iterations):
+        started = time.perf_counter()
+        result = subprocess.run(argv, capture_output=True, text=True, encoding="utf-8", errors="replace")
+        elapsed_ms = (time.perf_counter() - started) * 1000
+        samples_ms.append(elapsed_ms)
+        # Exit code 1 means "no hits", which is still a valid run.
+        if result.returncode in (0, 1):
+            continue
+        sys.stderr.write(f"subprocess returned {result.returncode}; stderr={result.stderr[:200]!r}\n")
+    return samples_ms
+
+
+def percentile(values: list[float], p: float) -> float:
+    """Return the ``p``-quantile of ``values`` using linear interpolation.
+
+    The values are sorted, the fractional rank ``(len - 1) * p`` is located,
+    and the result is interpolated between the two neighbouring elements.
+
+    Args:
+        values: Samples in any order; the list is not modified.
+        p: Quantile as a fraction in ``[0, 1]`` (``0.95`` for p95).
+
+    Returns:
+        The interpolated quantile, or ``0.0`` when ``values`` is empty.
+
+    Raises:
+        ValueError: If ``p`` is outside ``[0, 1]`` (including NaN). Without
+            this check, ``p > 1`` indexes past the end of the list and
+            ``p < 0`` silently extrapolates below the minimum.
+    """
+    if not 0.0 <= p <= 1.0:
+        raise ValueError(f"p must be within [0, 1], got {p!r}")
+    if not values:
+        return 0.0
+    ordered = sorted(values)
+    rank = (len(ordered) - 1) * p
+    lower = int(rank)
+    # Clamp the upper neighbour so p == 1.0 does not index past the end.
+    upper = min(lower + 1, len(ordered) - 1)
+    return ordered[lower] + (ordered[upper] - ordered[lower]) * (rank - lower)
+
+
+def summarize(label: str, values: list[float]) -> None:
+    """Print summary statistics for one series of millisecond samples.
+
+    Prints the label followed by a single line with n, min, median, mean,
+    p95 and max. An empty series prints ``(no data)`` next to the label.
+
+    Args:
+        label: Heading printed above the statistics.
+        values: Durations in milliseconds.
+    """
+    if values:
+        print(f"  {label}:")
+        fields = (
+            f"n={len(values)}",
+            f"min={min(values):.2f}ms",
+            f"median={statistics.median(values):.2f}ms",
+            f"mean={statistics.mean(values):.2f}ms",
+            f"p95={percentile(values, 0.95):.2f}ms",
+            f"max={max(values):.2f}ms",
+        )
+        # Four-space indent, fields separated by two spaces.
+        print("    " + "  ".join(fields))
+    else:
+        print(f"  {label}: (no data)")
+
+def main() -> int:
+    """Parse CLI options, benchmark both paths per query, and print results.
+
+    For every query the fast path is timed first, then the subprocess path;
+    each gets its own summary plus a median speedup line. After all queries,
+    an aggregate summary over the pooled samples is printed with median and
+    p95 speedups.
+
+    Returns:
+        ``0`` always; the value is used as the process exit status.
+    """
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "--queries",
+        nargs="+",
+        default=DEFAULT_QUERIES,
+        help=f"JAV IDs to search (default: {DEFAULT_QUERIES})",
+    )
+    parser.add_argument(
+        "--iterations",
+        type=int,
+        default=DEFAULT_ITERATIONS,
+        help=f"Iterations per query per path (default: {DEFAULT_ITERATIONS})",
+    )
+    opts = parser.parse_args()
+
+    banner = (
+        "Host fast-path vs subprocess rc-jav.py benchmark",
+        f"queries: {opts.queries}",
+        f"iterations per path: {opts.iterations}",
+        f"cache: {ROOT / 'cache.json'}",
+        "",
+    )
+    for line in banner:
+        print(line)
+
+    fast_pool: list[float] = []
+    sub_pool: list[float] = []
+    for query in opts.queries:
+        print(f"[{query}]")
+        fast_ms = time_fast_path(query, opts.iterations)
+        summarize("fast-path (in-process dict walk)", fast_ms)
+        sub_ms = time_subprocess(query, opts.iterations)
+        summarize("subprocess rc-jav.py --search --cache", sub_ms)
+        fast_pool += fast_ms
+        sub_pool += sub_ms
+        if fast_ms and sub_ms:
+            # Floor the denominator at 0.001 ms to avoid dividing by zero.
+            fast_median = max(statistics.median(fast_ms), 0.001)
+            ratio = statistics.median(sub_ms) / fast_median
+            print(f"  speedup (median sub / median fast): {ratio:.1f}x")
+        print()
+
+    print("=== aggregate ===")
+    summarize("fast-path total", fast_pool)
+    summarize("subprocess total", sub_pool)
+    if fast_pool and sub_pool:
+        median_ratio = statistics.median(sub_pool) / max(statistics.median(fast_pool), 0.001)
+        p95_ratio = percentile(sub_pool, 0.95) / max(percentile(fast_pool, 0.95), 0.001)
+        print(f"  median speedup: {median_ratio:.1f}x")
+        print(f"  p95 speedup:    {p95_ratio:.1f}x")
+    return 0
+
+
+# Run the benchmark only when executed as a script; main()'s return value
+# becomes the process exit status.
+if __name__ == "__main__":
+    sys.exit(main())