From e923ca48798e797d284185254101e3911cad5fb8 Mon Sep 17 00:00:00 2001 From: Michael Ding Date: Thu, 28 May 2026 12:20:58 +0800 Subject: [PATCH 1/5] feat: add hash-based image naming with git-style directory structure --- src/kbmate_cli/image_helper.py | 41 +++++++++++++++---- tests/test_image_helper.py | 72 +++++++++++++++++++++++++++++++++- 2 files changed, 104 insertions(+), 9 deletions(-) diff --git a/src/kbmate_cli/image_helper.py b/src/kbmate_cli/image_helper.py index d4d808d..a1078a4 100644 --- a/src/kbmate_cli/image_helper.py +++ b/src/kbmate_cli/image_helper.py @@ -1,3 +1,4 @@ +import hashlib import re import shutil from pathlib import Path @@ -10,7 +11,14 @@ def normalize_image_refs(markdown: str) -> str: return result -def extract_and_relink_images(markdown: str, src_dir: str, dst_dir: str) -> str: +def extract_and_relink_images( + markdown: str, + src_dir: str, + dst_dir: str, + *, + sequential: bool = False, + ref_subdir: str = "", +) -> str: src_path = Path(src_dir) dst_path = Path(dst_dir) dst_path.mkdir(parents=True, exist_ok=True) @@ -24,12 +32,29 @@ def extract_and_relink_images(markdown: str, src_dir: str, dst_dir: str) -> str: filename = Path(img_path).name src_file = next(src_path.rglob(filename), None) if src_file is not None and src_file.is_file(): - new_name = f"image-{counter:03d}{src_file.suffix}" - dst_file = dst_path / new_name - shutil.move(str(src_file), str(dst_file)) - old_ref = match.group(0) - new_ref = f"![](assets/{dst_path.name}/{new_name})" - result = result.replace(old_ref, new_ref, 1) - counter += 1 + if sequential: + new_name = f"image-{counter:03d}{src_file.suffix}" + dst_file = dst_path / new_name + shutil.move(str(src_file), str(dst_file)) + old_ref = match.group(0) + new_ref = f"![](assets/{dst_path.name}/{new_name})" + result = result.replace(old_ref, new_ref, 1) + counter += 1 + else: + content = src_file.read_bytes() + hash_hex = hashlib.sha256(content).hexdigest() + hash_prefix = hash_hex[:2] + hash_sub = hash_hex[2:4] + hash_dir = dst_path / hash_prefix / hash_sub + hash_dir.mkdir(parents=True, exist_ok=True) + dst_file = hash_dir / f"{hash_hex}{src_file.suffix}" + if not dst_file.exists(): + shutil.move(str(src_file), str(dst_file)) + else: + src_file.unlink() + old_ref = match.group(0) + ref_parts = [p for p in ("assets", ref_subdir, hash_prefix, hash_sub, f"{hash_hex}{src_file.suffix}") if p] + new_ref = f"![]({'/'.join(ref_parts)})" + result = result.replace(old_ref, new_ref, 1) return result diff --git a/tests/test_image_helper.py b/tests/test_image_helper.py index 840918d..0df0f3a 100644 --- a/tests/test_image_helper.py +++ b/tests/test_image_helper.py @@ -1,3 +1,5 @@ +import hashlib + import pytest from pathlib import Path from kbmate_cli.image_helper import normalize_image_refs, extract_and_relink_images @@ -29,10 +31,78 @@ def test_extract_and_relink_images(): (src_dir / "doc-0001-01.png").touch() (src_dir / "doc-0001-02.png").touch() - result = extract_and_relink_images(md_content, str(src_dir), str(dst_dir)) + result = extract_and_relink_images(md_content, str(src_dir), str(dst_dir), sequential=True) assert (dst_dir / "image-001.png").exists() assert (dst_dir / "image-002.png").exists() assert "![](assets/test_relink_dst/image-001.png)" in result assert "![](assets/test_relink_dst/image-002.png)" in result assert "doc-0001-01" not in result + + +def test_extract_and_relink_images_hash_naming(): + content1 = b"hello world image 1" + content2 = b"hello world image 2" + md_content = ( + "![](/tmp/src/doc-0001-01.png)\n" + "![](/tmp/src/doc-0001-02.png)" + ) + src_dir = Path("/tmp/test_hash_src") + dst_dir = Path("/tmp/test_hash_dst") + src_dir.mkdir(parents=True, exist_ok=True) + dst_dir.mkdir(parents=True, exist_ok=True) + (src_dir / "doc-0001-01.png").write_bytes(content1) + (src_dir / "doc-0001-02.png").write_bytes(content2) + + hash1 = hashlib.sha256(content1).hexdigest() + hash2 = hashlib.sha256(content2).hexdigest() + + result = extract_and_relink_images(md_content, str(src_dir), str(dst_dir), sequential=False) + + assert (dst_dir / hash1[:2] / hash1[2:4] / f"{hash1}.png").exists() + assert (dst_dir / hash2[:2] / hash2[2:4] / f"{hash2}.png").exists() + assert f"assets/{hash1[:2]}/{hash1[2:4]}/{hash1}.png" in result + assert f"assets/{hash2[:2]}/{hash2[2:4]}/{hash2}.png" in result + assert "image-001" not in result + + +def test_extract_and_relink_images_dedup(): + content = b"duplicate content" + md_content = ( + "![](/tmp/src_dedup/img-a.png)\n" + "![](/tmp/src_dedup/img-b.png)" + ) + src_dir = Path("/tmp/test_dedup_src") + dst_dir = Path("/tmp/test_dedup_dst") + src_dir.mkdir(parents=True, exist_ok=True) + dst_dir.mkdir(parents=True, exist_ok=True) + (src_dir / "img-a.png").write_bytes(content) + (src_dir / "img-b.png").write_bytes(content) + + hash_hex = hashlib.sha256(content).hexdigest() + + result = extract_and_relink_images(md_content, str(src_dir), str(dst_dir), sequential=False) + + hash_file = dst_dir / hash_hex[:2] / hash_hex[2:4] / f"{hash_hex}.png" + assert hash_file.exists() + expected_ref = f"assets/{hash_hex[:2]}/{hash_hex[2:4]}/{hash_hex}.png" + assert result.count(expected_ref) == 2 + + +def test_extract_and_relink_images_hash_ref_subdir(): + content = b"some content" + md_content = "![](/tmp/src_ref/img.png)" + src_dir = Path("/tmp/test_ref_src") + dst_dir = Path("/tmp/test_ref_dst") + src_dir.mkdir(parents=True, exist_ok=True) + dst_dir.mkdir(parents=True, exist_ok=True) + (src_dir / "img.png").write_bytes(content) + + hash_hex = hashlib.sha256(content).hexdigest() + result = extract_and_relink_images( + md_content, str(src_dir), str(dst_dir), + sequential=False, ref_subdir="subdir", + ) + + expected = f"assets/subdir/{hash_hex[:2]}/{hash_hex[2:4]}/{hash_hex}.png" + assert expected in result From 0849eacebbe00ee46b6caca93487477d09f750b7 Mon Sep 17 00:00:00 2001 From: Michael Ding Date: Thu, 28 May 2026 12:26:37 +0800 Subject: [PATCH 2/5] feat: add --assets-seqname flag, default to hash-based image naming --- src/kbmate_cli/main.py | 30 ++++++++++++++++++++---------- tests/test_cli.py | 36 ++++++++++++++++++++++++++++++++---- 2 files changed, 52 insertions(+), 14 deletions(-) diff --git a/src/kbmate_cli/main.py b/src/kbmate_cli/main.py index 6d74451..bb46b73 100644 --- a/src/kbmate_cli/main.py +++ b/src/kbmate_cli/main.py @@ -48,22 +48,22 @@ def _collect_files_from_list(filelist: Path) -> list[str]: return [line.strip() for line in lines if line.strip()] -def _convert_pdf(src: Path, assets_dir: Path) -> str: +def _convert_pdf(src: Path, src_dir: Path, dst_dir: Path, *, sequential: bool = False, ref_subdir: str = "") -> str: from kbmate_cli.pdf_converter import convert_pdf from kbmate_cli.image_helper import extract_and_relink_images - md = convert_pdf(str(src), str(assets_dir)) - return extract_and_relink_images(md, str(assets_dir), str(assets_dir)) + md = convert_pdf(str(src), str(src_dir)) + return extract_and_relink_images(md, str(src_dir), str(dst_dir), sequential=sequential, ref_subdir=ref_subdir) -def _convert_docx(src: Path, assets_dir: Path) -> str: +def _convert_docx(src: Path, src_dir: Path, dst_dir: Path, *, sequential: bool = False, ref_subdir: str = "") -> str: from kbmate_cli.docx_converter import convert_docx from kbmate_cli.image_helper import normalize_image_refs, extract_and_relink_images - pandoc_output = assets_dir / "pandoc_output" + pandoc_output = src_dir / "pandoc_output" md = convert_docx(str(src), str(pandoc_output)) md = normalize_image_refs(md) - md = extract_and_relink_images(md, str(pandoc_output), str(assets_dir)) + md = extract_and_relink_images(md, str(pandoc_output), str(dst_dir), sequential=sequential, ref_subdir=ref_subdir) if pandoc_output.exists(): import shutil shutil.rmtree(pandoc_output) @@ -82,6 +82,7 @@ def convert_single( *, layout: Literal["flat", "mirror"] = "flat", relative_to: Path | None = None, + assets_seqname: bool = False, ) -> None: ext = source_path.suffix.lower() converter = _CONVERTERS.get(ext) @@ -107,7 +108,14 @@ def convert_single( converts_dir.mkdir(parents=True, exist_ok=True) assets_dir.mkdir(parents=True, exist_ok=True) - markdown_content = converter(source_path, assets_dir) + if assets_seqname: + markdown_content = converter(source_path, assets_dir, assets_dir, sequential=True) + else: + hash_base = assets_parent / rel + hash_base.mkdir(parents=True, exist_ok=True) + ref_subdir = "" if rel == Path(".") else str(rel) + markdown_content = converter(source_path, assets_dir, hash_base, sequential=False, ref_subdir=ref_subdir) + md_path = converts_dir / f"{safe_stem}.md" md_path.write_text(markdown_content, encoding="utf-8") typer.echo(f"Converted: {source_path} -> {md_path}") @@ -117,6 +125,7 @@ def convert_single( def convert( source_file: str = typer.Argument(..., help="Path or URL to the .docx or .pdf file"), output_dir: str = typer.Option("raw", help="Output directory"), + assets_seqname: bool = typer.Option(False, "--assets-seqname", help="Use sequential naming for asset images instead of content hash"), ): try: source_file, temp_path = _resolve_source(source_file) @@ -130,7 +139,7 @@ def convert( raise typer.Exit(code=1) try: - convert_single(src, Path(output_dir)) + convert_single(src, Path(output_dir), assets_seqname=assets_seqname) except ValueError as e: typer.echo(f"Error: {e}", err=True) raise typer.Exit(code=1) @@ -145,6 +154,7 @@ def bulk_convert( file_list: str = typer.Option(None, "-f", "--file-list", help="File containing one source per line"), output_dir: str = typer.Option("raw", help="Output directory"), output_layout: str = typer.Option("flat", "--output-layout", help="Output layout: flat or mirror"), + assets_seqname: bool = typer.Option(False, "--assets-seqname", help="Use sequential naming for asset images instead of content hash"), ): if (recursive is not None) == (file_list is not None): typer.echo("Error: specify either -r (directory) or -f (file list), not both", err=True) @@ -163,7 +173,7 @@ def bulk_convert( raise typer.Exit(code=1) for src_path in _collect_files_from_dir(root): try: - convert_single(src_path, out, layout=output_layout, relative_to=root) + convert_single(src_path, out, layout=output_layout, relative_to=root, assets_seqname=assets_seqname) except Exception as e: typer.echo(f"Error converting {src_path}: {e}", err=True) else: @@ -192,7 +202,7 @@ def bulk_convert( print_cleanup_hint(temp_path) continue try: - convert_single(src, out, layout=output_layout) + convert_single(src, out, layout=output_layout, assets_seqname=assets_seqname) except Exception as e: typer.echo(f"Error converting {src}: {e}", err=True) finally: diff --git a/tests/test_cli.py b/tests/test_cli.py index 89c2b6b..8d753f0 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -36,15 +36,43 @@ def test_convert_lunwen_pdf(): assert result.exit_code == 0 +def test_convert_assets_seqname_flag_in_help(): + result = runner.invoke(app, ["convert", "--help"]) + assert result.exit_code == 0 + assert "--assets-seqname" in result.stdout + + +def test_bulk_convert_assets_seqname_flag_in_help(): + result = runner.invoke(app, ["bulk-convert", "--help"]) + assert result.exit_code == 0 + assert "--assets-seqname" in result.stdout + + +@patch.dict("kbmate_cli.main._CONVERTERS", {".pdf": MagicMock(return_value="# mock")}) +def test_bulk_convert_assets_seqname_flag(): + with tempfile.TemporaryDirectory() as tmp: + root = Path(tmp) + pdf = root / "a.pdf" + pdf.write_text("fake") + out = root / "out" + result = runner.invoke(app, [ + "bulk-convert", "-r", str(root), + "--output-dir", str(out), + "--assets-seqname", + ]) + assert result.exit_code == 0 + assets_dir = out / "assets" / "a" + assert assets_dir.exists() + + def test_convert_pdf_with_spaces_in_filename(): src = FIXTURE_DIR / "eigent README CN.pdf" out = Path("/tmp/test_cli_spaces_output") result = runner.invoke(app, ["convert", str(src), "--output-dir", str(out)]) assert result.exit_code == 0, f"Failed with output: {result.output}" - assets_dir = out / "assets" / "eigent_README_CN" - assert assets_dir.exists() - images = list(assets_dir.glob("*")) - assert len(images) > 0, f"No images found in {assets_dir}" + assert (out / "assets").exists() + hash_dirs = list((out / "assets").glob("*")) + assert len(hash_dirs) > 0, f"No hash dirs found in {out / 'assets'}" def test_convert_file_url(): From 0e71ada7c3d200eb927afa5cf1ba003667e680e0 Mon Sep 17 00:00:00 2001 From: Michael Ding Date: Thu, 28 May 2026 13:16:15 +0800 Subject: [PATCH 3/5] docs: add design spec and implementation plan for asset hash naming --- .../plans/2026-05-28-assets-hash-naming.md | 369 ++++++++++++++++++ .../2026-05-28-assets-hash-naming-design.md | 144 +++++++ 2 files changed, 513 insertions(+) create mode 100644 docs/superpowers/plans/2026-05-28-assets-hash-naming.md create mode 100644 docs/superpowers/specs/2026-05-28-assets-hash-naming-design.md diff --git a/docs/superpowers/plans/2026-05-28-assets-hash-naming.md b/docs/superpowers/plans/2026-05-28-assets-hash-naming.md new file mode 100644 index 0000000..657f212 --- /dev/null +++ b/docs/superpowers/plans/2026-05-28-assets-hash-naming.md @@ -0,0 +1,369 @@ +# Assets Hash Naming Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Add `--assets-seqname` flag to `convert`/`bulk-convert`; default image naming uses SHA256 content hash with git-style two-level directory. + +**Architecture:** `extract_and_relink_images()` in `image_helper.py` receives `sequential` and `ref_subdir` parameters. Hash mode computes SHA256, stores under `dst_dir/{hash[:2]}/{hash[2:4]}/{hash64}.ext`, deduplicates. Call chain: CLI → `convert_single` → `_convert_pdf`/`_convert_docx` → `extract_and_relink_images`. + +**Tech Stack:** Python 3.12+, `hashlib` (stdlib), `shutil` (stdlib), `typer` + +--- + +## File Structure + +| File | Responsibility | Change | +|------|---------------|--------| +| `src/kbmate_cli/image_helper.py` | Hash computation, file move, ref rewrite | Add `sequential`/`ref_subdir` params, hash naming path | +| `src/kbmate_cli/main.py` | CLI commands, orchestration | Add `--assets-seqname` option, propagate through call chain | +| `tests/test_image_helper.py` | Test hash naming logic | Add hash naming + dedup + ref_subdir tests | +| `tests/test_cli.py` | CLI integration tests | Add `--assets-seqname` integration test | + +--- + +### Task 1: Update `extract_and_relink_images` with hash naming + +**Files:** +- Modify: `src/kbmate_cli/image_helper.py` +- Test: `tests/test_image_helper.py` + +- [ ] **Step 1: Write failing test for hash naming** + +```python +import hashlib +from kbmate_cli.image_helper import extract_and_relink_images + +def test_extract_and_relink_images_hash_naming(): + content1 = b"hello world image 1" + content2 = b"hello world image 2" + md_content = ( + "![](/tmp/src/doc-0001-01.png)\n" + "![](/tmp/src/doc-0001-02.png)" + ) + src_dir = Path("/tmp/test_hash_src") + dst_dir = Path("/tmp/test_hash_dst") + src_dir.mkdir(parents=True, exist_ok=True) + dst_dir.mkdir(parents=True, exist_ok=True) + (src_dir / "doc-0001-01.png").write_bytes(content1) + (src_dir / "doc-0001-02.png").write_bytes(content2) + + hash1 = hashlib.sha256(content1).hexdigest() + hash2 = hashlib.sha256(content2).hexdigest() + + result = extract_and_relink_images(md_content, str(src_dir), str(dst_dir), sequential=False) + + assert (dst_dir / hash1[:2] / hash1[2:4] / f"{hash1}.png").exists() + assert (dst_dir / hash2[:2] / hash2[2:4] / f"{hash2}.png").exists() + assert f"assets/{hash1[:2]}/{hash1[2:4]}/{hash1}.png" in result + assert f"assets/{hash2[:2]}/{hash2[2:4]}/{hash2}.png" in result + assert "image-001" not in result +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `uv run pytest tests/test_image_helper.py::test_extract_and_relink_images_hash_naming -v` +Expected: FAIL — `extract_and_relink_images` doesn't accept `sequential` keyword + +- [ ] **Step 3: Add `sequential` and `ref_subdir` params to `extract_and_relink_images`** + +```python +import hashlib + +def extract_and_relink_images( + markdown: str, + src_dir: str, + dst_dir: str, + *, + sequential: bool = False, + ref_subdir: str = "", +) -> str: + src_path = Path(src_dir) + dst_path = Path(dst_dir) + dst_path.mkdir(parents=True, exist_ok=True) + + img_pattern = re.compile(r'!\[.*?\]\(([^)]+)\)') + counter = 1 + result = markdown + + for match in img_pattern.finditer(markdown): + img_path = match.group(1) + filename = Path(img_path).name + src_file = next(src_path.rglob(filename), None) + if src_file is not None and src_file.is_file(): + if sequential: + new_name = f"image-{counter:03d}{src_file.suffix}" + dst_file = dst_path / new_name + shutil.move(str(src_file), str(dst_file)) + old_ref = match.group(0) + new_ref = f"![](assets/{dst_path.name}/{new_name})" + result = result.replace(old_ref, new_ref, 1) + counter += 1 + else: + content = src_file.read_bytes() + hash_hex = hashlib.sha256(content).hexdigest() + hash_prefix = hash_hex[:2] + hash_sub = hash_hex[2:4] + hash_dir = dst_path / hash_prefix / hash_sub + hash_dir.mkdir(parents=True, exist_ok=True) + dst_file = hash_dir / f"{hash_hex}{src_file.suffix}" + if not dst_file.exists(): + shutil.move(str(src_file), str(dst_file)) + else: + src_file.unlink() + old_ref = match.group(0) + ref_parts = [p for p in ("assets", ref_subdir, hash_prefix, hash_sub, f"{hash_hex}{src_file.suffix}") if p] + new_ref = f"![]({'/'.join(ref_parts)})" + result = result.replace(old_ref, new_ref, 1) + + return result +``` + +- [ ] **Step 4: Run test to verify it passes** + +Run: `uv run pytest tests/test_image_helper.py::test_extract_and_relink_images_hash_naming -v` +Expected: PASS + +- [ ] **Step 5: Write test for deduplication** + +```python +def test_extract_and_relink_images_dedup(): + content = b"duplicate content" + md_content = ( + "![](/tmp/src_dedup/img-a.png)\n" + "![](/tmp/src_dedup/img-b.png)" + ) + src_dir = Path("/tmp/test_dedup_src") + dst_dir = Path("/tmp/test_dedup_dst") + src_dir.mkdir(parents=True, exist_ok=True) + dst_dir.mkdir(parents=True, exist_ok=True) + (src_dir / "img-a.png").write_bytes(content) + (src_dir / "img-b.png").write_bytes(content) + + hash_hex = hashlib.sha256(content).hexdigest() + + result = extract_and_relink_images(md_content, str(src_dir), str(dst_dir), sequential=False) + + # Only one file on disk + hash_file = dst_dir / hash_hex[:2] / hash_hex[2:4] / f"{hash_hex}.png" + assert hash_file.exists() + # Both refs point to the same hash path + expected_ref = f"assets/{hash_hex[:2]}/{hash_hex[2:4]}/{hash_hex}.png" + assert result.count(expected_ref) == 2 +``` + +- [ ] **Step 6: Run dedup test** + +Run: `uv run pytest tests/test_image_helper.py::test_extract_and_relink_images_dedup -v` +Expected: PASS + +- [ ] **Step 7: Write test for `ref_subdir` in hash mode** + +```python +def test_extract_and_relink_images_hash_ref_subdir(): + content = b"some content" + md_content = "![](/tmp/src_ref/img.png)" + src_dir = Path("/tmp/test_ref_src") + dst_dir = Path("/tmp/test_ref_dst") + src_dir.mkdir(parents=True, exist_ok=True) + dst_dir.mkdir(parents=True, exist_ok=True) + (src_dir / "img.png").write_bytes(content) + + hash_hex = hashlib.sha256(content).hexdigest() + result = extract_and_relink_images( + md_content, str(src_dir), str(dst_dir), + sequential=False, ref_subdir="subdir", + ) + + expected = f"assets/subdir/{hash_hex[:2]}/{hash_hex[2:4]}/{hash_hex}.png" + assert expected in result +``` + +- [ ] **Step 8: Run ref_subdir test** + +Run: `uv run pytest tests/test_image_helper.py::test_extract_and_relink_images_hash_ref_subdir -v` +Expected: PASS + +- [ ] **Step 9: Run all image helper tests** + +Run: `uv run pytest tests/test_image_helper.py -v` +Expected: All 5 tests pass (3 new + 2 existing) + +- [ ] **Step 10: Commit** + +```bash +git add src/kbmate_cli/image_helper.py tests/test_image_helper.py +git commit -m "feat: add hash-based image naming with git-style directory structure" +``` + +--- + +### Task 2: Wire `--assets-seqname` through CLI chain + +**Files:** +- Modify: `src/kbmate_cli/main.py` +- Test: `tests/test_cli.py` + +- [ ] **Step 1: Write failing CLI test for `--assets-seqname`** + +```python +def test_convert_assets_seqname_flag_in_help(): + result = runner.invoke(app, ["convert", "--help"]) + assert result.exit_code == 0 + assert "--assets-seqname" in result.stdout +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `uv run pytest tests/test_cli.py::test_convert_assets_seqname_flag_in_help -v` +Expected: FAIL — `--assets-seqname` not yet added + +- [ ] **Step 3: Add `assets_seqname` parameter to `convert` and `bulk_convert`** + +```python +@app.command() +def convert( + source_file: str = typer.Argument(..., help="Path or URL to the .docx or .pdf file"), + output_dir: str = typer.Option("raw", help="Output directory"), + assets_seqname: bool = typer.Option(False, "--assets-seqname", help="Use sequential naming for asset images instead of content hash"), +): + ... + try: + convert_single(src, Path(output_dir), assets_seqname=assets_seqname) + ... +``` + +```python +@app.command() +def bulk_convert( + recursive: str = typer.Option(None, "-r", "--recursive", help="Directory to scan recursively"), + file_list: str = typer.Option(None, "-f", "--file-list", help="File containing one source per line"), + output_dir: str = typer.Option("raw", help="Output directory"), + output_layout: str = typer.Option("flat", "--output-layout", help="Output layout: flat or mirror"), + assets_seqname: bool = typer.Option(False, "--assets-seqname", help="Use sequential naming for asset images instead of content hash"), +): + ... + for src_path in _collect_files_from_dir(root): + try: + convert_single(src_path, out, layout=output_layout, relative_to=root, assets_seqname=assets_seqname) + ... + for line in lines: + try: + convert_single(src, out, layout=output_layout, assets_seqname=assets_seqname) + ... +``` + +- [ ] **Step 4: Update `convert_single` signature and implementation** + +```python +def convert_single( + source_path: Path, + output_dir: Path, + *, + layout: Literal["flat", "mirror"] = "flat", + relative_to: Path | None = None, + assets_seqname: bool = False, +) -> None: + ... + assets_dir = assets_parent / rel / safe_stem + converts_dir = output_dir / "converts" / rel + converts_dir.mkdir(parents=True, exist_ok=True) + assets_dir.mkdir(parents=True, exist_ok=True) + + if assets_seqname: + markdown_content = converter(source_path, assets_dir, assets_dir, sequential=True) + else: + hash_base = assets_parent / rel + hash_base.mkdir(parents=True, exist_ok=True) + ref_subdir = "" if rel == Path(".") else str(rel) + markdown_content = converter(source_path, assets_dir, hash_base, sequential=False, ref_subdir=ref_subdir) + ... +``` + +- [ ] **Step 5: Update `_convert_pdf` and `_convert_docx` signatures** + +```python +def _convert_pdf(src: Path, src_dir: Path, dst_dir: Path, *, sequential: bool = False, ref_subdir: str = "") -> str: + from kbmate_cli.pdf_converter import convert_pdf + from kbmate_cli.image_helper import extract_and_relink_images + md = convert_pdf(str(src), str(src_dir)) + return extract_and_relink_images(md, str(src_dir), str(dst_dir), sequential=sequential, ref_subdir=ref_subdir) + + +def _convert_docx(src: Path, src_dir: Path, dst_dir: Path, *, sequential: bool = False, ref_subdir: str = "") -> str: + from kbmate_cli.docx_converter import convert_docx + from kbmate_cli.image_helper import normalize_image_refs, extract_and_relink_images + pandoc_output = src_dir / "pandoc_output" + md = convert_docx(str(src), str(pandoc_output)) + md = normalize_image_refs(md) + md = extract_and_relink_images(md, str(pandoc_output), str(dst_dir), sequential=sequential, ref_subdir=ref_subdir) + if pandoc_output.exists(): + import shutil + shutil.rmtree(pandoc_output) + return md +``` + +- [ ] **Step 6: Run test to verify it passes** + +Run: `uv run pytest tests/test_cli.py::test_convert_assets_seqname_flag_in_help -v` +Expected: PASS + +- [ ] **Step 7: Add integration test for non-sequential (hash) default** + +```python +@patch.dict("kbmate_cli.main._CONVERTERS", {".pdf": MagicMock(return_value="# mock")}) +def test_bulk_convert_assets_seqname_flag(): + with tempfile.TemporaryDirectory() as tmp: + root = Path(tmp) + pdf = root / "a.pdf" + pdf.write_text("fake") + out = root / "out" + result = runner.invoke(app, [ + "bulk-convert", "-r", str(root), + "--output-dir", str(out), + "--assets-seqname", + ]) + assert result.exit_code == 0 + + # Sequential mode should have per-doc assets subdir + assets_dir = out / "assets" / "a" + assert assets_dir.exists() +``` + +- [ ] **Step 8: Run integration test** + +Run: `uv run pytest tests/test_cli.py::test_bulk_convert_assets_seqname_flag -v` +Expected: PASS + +- [ ] **Step 9: Update `test_convert_pdf_with_spaces_in_filename` to pass with hash naming** + +The existing test checks for per-doc subdirectory `assets/eigent_README_CN`. With default hash naming, images no longer go to per-doc subdirs. Update the test: + +```python +def test_convert_pdf_with_spaces_in_filename(): + src = FIXTURE_DIR / "eigent README CN.pdf" + out = Path("/tmp/test_cli_spaces_output") + result = runner.invoke(app, ["convert", str(src), "--output-dir", str(out)]) + assert result.exit_code == 0, f"Failed with output: {result.output}" + # In default hash mode, images are under hash subdirs inside assets/ + assert (out / "assets").exists() + # At least one hash directory level exists + hash_dirs = list((out / "assets").glob("*")) + assert len(hash_dirs) > 0, f"No hash dirs found in {out / 'assets'}" +``` + +- [ ] **Step 10: Run all existing tests** + +Run: `uv run pytest tests/test_cli.py tests/test_image_helper.py -v` +Expected: All tests pass. (If any fail, fix them.) + +- [ ] **Step 11: Run full test suite** + +Run: `uv run pytest -v` +Expected: All tests pass, coverage >= 85% + +- [ ] **Step 12: Commit** + +```bash +git add src/kbmate_cli/main.py src/kbmate_cli/image_helper.py tests/ +git commit -m "feat: add --assets-seqname flag, default to hash-based image naming" +``` diff --git a/docs/superpowers/specs/2026-05-28-assets-hash-naming-design.md b/docs/superpowers/specs/2026-05-28-assets-hash-naming-design.md new file mode 100644 index 0000000..773640f --- /dev/null +++ b/docs/superpowers/specs/2026-05-28-assets-hash-naming-design.md @@ -0,0 +1,144 @@ +# Assets Hash Naming Design + +> **Goal:** Change the default image naming strategy in kbmate-cli from sequential (`image-001.png`) to content-addressed (SHA256 hash), with a `--assets-seqname` flag to opt into the old sequential behavior. + +**Architecture:** The change is contained in `image_helper.py`'s `extract_and_relink_images()`, which receives a new `sequential` boolean parameter. The hash-naming path computes SHA256 of file content, creates a two-level directory structure (`hash[:2]/hash[2:4]/fullhash.ext`), and moves the file accordingly. Markdown references are updated to match the new paths. CLI entry points (`convert`, `bulk_convert`) expose a `--assets-seqname` flag that propagates down to the converter. + +**Tech Stack:** Python 3.12+, `hashlib` (stdlib), `shutil` (stdlib) + +--- + +## Default behavior (hash naming) + +When `--assets-seqname` is NOT set: + +1. For each extracted image, read its content and compute SHA256 digest (full 64-character hex string) +2. Create a two-level subdirectory under the assets directory: `hash[:2] / hash[2:4] /` +3. Move the image file to `hash[:2] / hash[2:4] / fullhash.suffix` +4. If a file with the same hash already exists at the target path, skip moving (deduplication) +5. Update the markdown image reference to point to the correct relative path: `![](assets/hash[:2]/hash[2:4]/fullhash.suffix)` + +### Example + +Given an image `doc-0001-01.png` with SHA256 `a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4e5f6`: + +- Location: `assets/a1/b2/a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4e5f6.png` +- Markdown reference: `![](assets/a1/b2/a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4e5f6.png)` + +### Deduplication + +If two images in the source have identical content, they will resolve to the same hash. The second occurrence will detect the target file already exists and skip the `shutil.move`, but still update the markdown reference to point to the same path. + +### Directory structure change from current + +Current (sequential): +``` +assets// +├── image-001.png +├── image-002.png +``` + +New (hash, default): +``` +assets/ +├── a1/ +│ └── b2/ +│ └── a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4e5f6.png +└── ff/ + └── ee/ + └── ffee...png +``` + +> **Note:** The per-document subdirectory (`/`) is removed in hash mode — all images share the flat two-level hash structure under `assets/`. + +--- + +## `--assets-seqname` mode (sequential, current behavior) + +When `--assets-seqname` is set: + +1. Uses the current `image-001.png`, `image-002.png`, ... naming +2. Images are placed in `assets///` (unchanged from current) +3. Behaves identically to today's `extract_and_relink_images()` + +--- + +## CLI changes + +### `convert` command + +```python +def convert( + source_file: str = typer.Argument(...), + output_dir: str = typer.Option("raw"), + assets_seqname: bool = typer.Option(False, "--assets-seqname", help="Use sequential naming for asset images"), +): +``` + +### `bulk_convert` command + +```python +def bulk_convert( + recursive: str = typer.Option(None, "-r", "--recursive"), + file_list: str = typer.Option(None, "-f", "--file-list"), + output_dir: str = typer.Option("raw"), + output_layout: str = typer.Option("flat", "--output-layout"), + assets_seqname: bool = typer.Option(False, "--assets-seqname", help="Use sequential naming for asset images"), +): +``` + +### Propagation path + +``` +convert / bulk_convert + └─ assets_seqname (CLI parameter) + └─ convert_single(assets_seqname=...) + └─ _convert_pdf(src, assets_dir, assets_seqname) + └─ extract_and_relink_images(md, src_dir, dst_dir, sequential=...) + └─ _convert_docx(src, assets_dir, assets_seqname) + └─ extract_and_relink_images(md, src_dir, dst_dir, sequential=...) +``` + +--- + +## Files to modify + +| File | Change | +|------|--------| +| `src/kbmate_cli/main.py` | Add `assets_seqname` to `convert`, `bulk_convert`, `convert_single`, `_convert_pdf`, `_convert_docx` | +| `src/kbmate_cli/image_helper.py` | Add `sequential: bool = False` to `extract_and_relink_images()`; implement hash-naming path | +| `tests/test_image_helper.py` | Add test cases for hash naming with directory structure | +| `tests/test_cli.py` | Add CLI integration tests (if applicable) | + +--- + +## `image_helper.py` — new `extract_and_relink_images` signature + +```python +def extract_and_relink_images( + markdown: str, + src_dir: str, + dst_dir: str, + *, + sequential: bool = False, +) -> str: +``` + +When `sequential=True`: +- Use current counter-based naming (`image-001.png`) + +When `sequential=False` (default): +- For each image, compute SHA256 of file contents +- Construct path: `dst_dir / hash[:2] / hash[2:4] / hash{suffix}` +- Handle deduplication (skip move if target exists) +- Update markdown references accordingly + +--- + +## Testing + +- Test hash naming produces correct directory structure: `hash[:2] / hash[2:4] / fullhash.png` +- Test deduplication: two identical images → single file on disk, correct references +- Test `--assets-seqname` flag restores sequential behavior (existing tests still pass) +- Test hash collision handling +- Test markdown refs are correctly rewritten From 36eaed0b3808614f17ebd166bea4dd7014e1a6f1 Mon Sep 17 00:00:00 2001 From: Michael Ding Date: Thu, 28 May 2026 13:16:30 +0800 Subject: [PATCH 4/5] chore: bump version to 0.4.0 --- pyproject.toml | 2 +- uv.lock | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 85774b1..5497f1b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "kbmate-cli" -version = "0.3.0" +version = "0.4.0" description = "CLI tools for knowledge base management" readme = "README.md" requires-python = ">=3.12" diff --git a/uv.lock b/uv.lock index 19855c5..299a6fd 100644 --- a/uv.lock +++ b/uv.lock @@ -135,7 +135,7 @@ wheels = [ [[package]] name = "kbmate-cli" -version = "0.3.0" +version = "0.4.0" source = { editable = "." } dependencies = [ { name = "pymupdf4llm" }, From 147f9121102e94afa9a3efa8245fd30593395114 Mon Sep 17 00:00:00 2001 From: Michael Ding Date: Thu, 28 May 2026 13:43:11 +0800 Subject: [PATCH 5/5] fix: rm valueless test case --- tests/test_cli.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/tests/test_cli.py b/tests/test_cli.py index 8d753f0..dfda1a3 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -36,18 +36,6 @@ def test_convert_lunwen_pdf(): assert result.exit_code == 0 -def test_convert_assets_seqname_flag_in_help(): - result = runner.invoke(app, ["convert", "--help"]) - assert result.exit_code == 0 - assert "--assets-seqname" in result.stdout - - -def test_bulk_convert_assets_seqname_flag_in_help(): - result = runner.invoke(app, ["bulk-convert", "--help"]) - assert result.exit_code == 0 - assert "--assets-seqname" in result.stdout - - @patch.dict("kbmate_cli.main._CONVERTERS", {".pdf": MagicMock(return_value="# mock")}) def test_bulk_convert_assets_seqname_flag(): with tempfile.TemporaryDirectory() as tmp: