Initial snapshot before step 10 package split

This commit is contained in:
admin
2026-05-22 21:39:09 +02:00
commit e029e898e9
16 changed files with 3955 additions and 0 deletions
+83
View File
@@ -0,0 +1,83 @@
# Shared JAV ID fixture corpus
JSON cases shared between the Python `rc-jav.py` CLI and the browser
extension at `D:\DEV\Extensions\Production\rclone-jav\`. Each side
reads the cases relevant to its own extraction surface.
## Files
| File | Domain | Consumer | Notes |
|-------------------------------|----------|----------------------------------------|-------|
| `filename-extraction.json` | filename | Python `extract_id(name)` | Has `#partN` expectations for multipart files |
| `query-extraction.json` | query | Extension `content.js` `normalizeId` | Looser context; extension never emits part suffix |
| `shared-normalization.json` | shared | BOTH | Contract: any mismatch here is a bug, not a fixture issue |
All files share the same shape:
```json
{
"version": 1,
"domain": "…",
"description": "…",
"case_schema": { },
"cases": [
{ "name": "…", "input": "…", "expected": "…" }
]
}
```
`expected: null` means "no ID should be detected".
## Running the Python side
```bash
python fixtures/run.py
```
The runner imports `rc-jav.py` in place, exercises `extract_id` against
`filename-extraction.json`, and `normalize_id` against
`shared-normalization.json`. Exit code is non-zero on any failure.
## Running the extension side
No automated runner today. `content.js` lives inside an IIFE that the
browser injects into pages, so importing it from Node would require
either an extraction refactor or a duplicated copy of the regex. Until
that lands, treat `query-extraction.json` and `shared-normalization.json`
as the canonical specification: if you touch `ID_RE_DASHED`,
`ID_RE_UNDASHED`, or `BUILTIN_ID_NORMALIZERS` in content.js, eyeball
this corpus and confirm the cases still describe expected behavior.
## Adding a case
1. Pick the file matching the surface you're testing.
2. Append a `{ "name", "input", "expected" }` entry. Keep `name`
descriptive — it's the only label shown when the runner fails.
3. If the case exercises a guarantee both sides must honor, add it to
`shared-normalization.json` as well.
4. Run `python fixtures/run.py` to confirm Python still passes.
## Known cross-side divergences (intentional)
These are NOT bugs — they reflect the different surfaces each side
extracts from. Recorded here so future contributors don't try to
"fix" them.
- **`FC2PPV1841460` compact form (no dashes).** The extension's
`BUILTIN_ID_NORMALIZERS` in `content.js` rewrites this to
`FC2-PPV-1841460` when seen in page titles. Python `extract_id`
does NOT — the compact form doesn't realistically appear in
filenames on disk. Hence the case lives in
`query-extraction.json` only (the "FC2 PPV compact" case, whose input
is currently spelled `FC2PPV-1841460`), not in
`filename-extraction.json` or `shared-normalization.json`.
If a case belongs to one side's contract but not the other's, file it
under the specific domain (`filename-` or `query-`) — not under
`shared-`.
## Ownership
This directory lives in the Python repo only because the Python repo
is the more stable root. Conceptually it's joint property of both
codebases. Don't add anything Python-specific to the JSON files — keep
them tool-neutral.
+24
View File
@@ -0,0 +1,24 @@
{
"version": 1,
"domain": "filename",
"description": "Filename → canonical JAV ID (with optional #partN suffix). Consumed by Python rc-jav.extract_id.",
"case_schema": {
"name": "human label",
"input": "filename including extension",
"expected": "canonical ID (e.g. ABC-001 or ABC-001#part1) or null when no ID present"
},
"cases": [
{ "name": "plain dashed ID", "input": "ABC-027.mp4", "expected": "ABC-027" },
{ "name": "dashed ID with resolution tag", "input": "SCOP-297 [1080p].mp4", "expected": "SCOP-297" },
{ "name": "bracket-wrapped ID", "input": "[REAL-779].mp4", "expected": "REAL-779" },
{ "name": "bracket-wrapped ID with extra tag", "input": "[SCOP-297] [1080p].mp4", "expected": "SCOP-297" },
{ "name": "no-hyphen fallback", "input": "MVSD312.avi", "expected": "MVSD-312" },
{ "name": "trailing lowercase variant letter", "input": "IBW-902z.mp4", "expected": "IBW-902z" },
{ "name": "multipart _PART suffix", "input": "KV-118 - Aiba Reika_PART1.mp4", "expected": "KV-118#part1" },
{ "name": "multipart _A letter suffix", "input": "KV-118_A.mp4", "expected": "KV-118#part1" },
{ "name": "multipart trailing -N before bracket", "input": "OFJE-195-7 [480p].mp4", "expected": "OFJE-195#part7" },
{ "name": "FC2 PPV plain", "input": "FC2-1841460.mp4", "expected": "FC2-PPV-1841460" },
{ "name": "FC2 PPV explicit", "input": "FC2-PPV-1841460.mp4", "expected": "FC2-PPV-1841460" },
{ "name": "no ID present", "input": "random_video.mp4", "expected": null }
]
}
+22
View File
@@ -0,0 +1,22 @@
{
"version": 1,
"domain": "query",
"description": "Page text / title -> canonical JAV ID. Consumed by the browser extension (content.js normalizeId). Difference from filename: looser context (sentences, mixed punctuation, site chrome). Includes forms (e.g. FC2PPV compact) that Python extract_id does NOT handle, by design — see fixtures/README.md.",
"case_schema": {
"name": "human label",
"input": "raw page text",
"expected": "canonical ID without part suffix (extension never emits #partN), or null when no ID found"
},
"cases": [
{ "name": "title with site chrome", "input": "SSIS-001 — JAV.tube", "expected": "SSIS-001" },
{ "name": "title with description", "input": "Watch SSIS-001 1080p HD Online", "expected": "SSIS-001" },
{ "name": "trailing letter variant", "input": "IBW-902z Full Movie", "expected": "IBW-902" },
{ "name": "no hyphen in title", "input": "MVSD312 stream", "expected": "MVSD-312" },
{ "name": "FC2 PPV compact", "input": "FC2PPV-1841460 — preview", "expected": "FC2-PPV-1841460" },
{ "name": "FC2 plain digits", "input": "FC2-1841460 thumbnail", "expected": "FC2-PPV-1841460" },
{ "name": "FC2-PPV explicit", "input": "FC2-PPV-1841460 Full", "expected": "FC2-PPV-1841460" },
{ "name": "leading zeros preserved", "input": "ABF-042 — sample", "expected": "ABF-042" },
{ "name": "long numeric tail (7 digits)", "input": "BLK-4748520 stream", "expected": "BLK-4748520" },
{ "name": "no ID present", "input": "JAV Database · home", "expected": null }
]
}
+70
View File
@@ -0,0 +1,70 @@
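"""Run the shared JAV-ID fixture corpus against rc-jav.py.

Checks ``filename-extraction.json`` against ``extract_id`` and
``shared-normalization.json`` against ``normalize_id``.
``query-extraction.json`` is consumed by the browser extension only and is
not run here.

Exits non-zero if any fixture case fails. No third-party dependencies.

Usage:
    python fixtures/run.py
"""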
from __future__ import annotations
import importlib.util
import json
import sys
from pathlib import Path
# Resolve locations relative to this script so the runner works from any cwd.
_HERE = Path(__file__).resolve()
FIXTURES = _HERE.parent
ROOT = _HERE.parents[1]

# "rc-jav.py" has a hyphen in its file name, so a plain ``import`` statement
# cannot reach it; load it from its path under the module name "rcjav".
SPEC = importlib.util.spec_from_file_location("rcjav", ROOT / "rc-jav.py")
RCJAV = importlib.util.module_from_spec(SPEC)
# Register the module before executing it so it is already present in
# sys.modules while its own top-level code runs.
sys.modules[SPEC.name] = RCJAV
SPEC.loader.exec_module(RCJAV)
def _load(name: str) -> dict:
    """Parse the fixture file *name* from the fixtures directory.

    The file is read as UTF-8 and decoded as JSON; the decoded document is
    returned as-is.
    """
    fixture_path = FIXTURES / name
    return json.loads(fixture_path.read_text(encoding="utf-8"))
def _run(label: str, cases: list[dict], fn) -> tuple[int, int]:
passed = 0
failed = 0
for case in cases:
got = fn(case["input"])
if got == case["expected"]:
passed += 1
else:
failed += 1
print(f" FAIL [{label}] {case['name']!r}")
print(f" input = {case['input']!r}")
print(f" expected = {case['expected']!r}")
print(f" got = {got!r}")
return passed, failed
def main() -> int:
    """Run every Python-side fixture file and report the totals.

    Returns:
        ``1`` when at least one case failed, ``0`` otherwise; the value is
        meant to be used as the process exit code.
    """
    # (fixture file, name shown in the report, callable under test)
    targets = (
        ("filename-extraction.json", "extract_id", RCJAV.extract_id),
        ("shared-normalization.json", "normalize_id", RCJAV.normalize_id),
    )
    ok_total = 0
    bad_total = 0
    for fixture_file, target_name, target in targets:
        cases = _load(fixture_file).get("cases", [])
        print(f"\n{fixture_file} -> rcjav.{target_name} ({len(cases)} cases)")
        ok, bad = _run(fixture_file, cases, target)
        ok_total += ok
        bad_total += bad
        print(f" {ok} passed | {bad} failed")

    print()
    if not bad_total:
        print(f"OK: all {ok_total} cases passed")
        return 0
    print(f"FAILED: {bad_total} of {ok_total + bad_total} cases")
    return 1
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status.
    raise SystemExit(main())
+17
View File
@@ -0,0 +1,17 @@
{
"version": 1,
"domain": "shared",
"description": "Raw ID forms → canonical form. Both Python (normalize_id) and the extension (content.js normalizeId) MUST agree on these. Mismatch here is a contract bug.",
"case_schema": {
"name": "human label",
"input": "raw ID-bearing token (no path, no extension)",
"expected": "canonical ID"
},
"cases": [
{ "name": "lowercase prefix uppercased", "input": "abc-027", "expected": "ABC-027" },
{ "name": "FC2 plain -> FC2-PPV", "input": "FC2-1841460", "expected": "FC2-PPV-1841460" },
{ "name": "FC2-PPV explicit preserved", "input": "FC2-PPV-1841460", "expected": "FC2-PPV-1841460" },
{ "name": "leading zeros preserved", "input": "ABF-042", "expected": "ABF-042" },
{ "name": "5-digit numeric segment", "input": "SDDE-12345", "expected": "SDDE-12345" }
]
}