nash-dir · nash-dir · Jun 22, 2026 · Jun 17, 2026 · Jun 17, 2026 · Jun 22, 2026
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
@@ -23,6 +23,10 @@ jobs:
   # ─────────────────────────────────────────────
   publish-vsce:
     name: Publish VSIX (${{ matrix.target }})
+    # Job-level so the Open VSX step's `if` can read it — a step's own step-level
+    # env is NOT visible to that same step's `if` condition.
+    env:
+      OVSX_PAT: ${{ secrets.OVSX_PAT }}
     strategy:
       fail-fast: false
       matrix:
@@ -46,6 +50,21 @@ jobs:
         with:
           node-version: "20"
 
+      # Fail fast if the release tag and the packaged version disagree, so we
+      # never publish a VSIX whose version differs from the tag people see.
+      - name: Verify version matches release tag
+        if: github.event_name == 'release'  # tag-vs-version check only applies to release runs
+        shell: bash
+        run: |
+          TAG="${GITHUB_REF_NAME}"  # read the tag from the runner env; inlining an expression here would allow script injection
+          VER="${TAG#v}"  # drop a leading "v" so v1.2.3 compares equal to 1.2.3
+          PKG=$(node -p "require('./vscode-extension/package.json').version")
+          echo "tag=$TAG  expected=$VER  package.json=$PKG"
+          if [ "$VER" != "$PKG" ]; then
+            echo "::error::package.json version ($PKG) does not match release tag ($VER)"
+            exit 1
+          fi
+
       - name: Install Extension dependencies
         working-directory: vscode-extension
         run: npm install
@@ -99,21 +118,55 @@ jobs:
         with:
           files: vscode-extension/*.vsix
 
+      # No continue-on-error: a failed Marketplace publish (e.g. an expired
+      # VSCE_PAT) MUST turn the job red. The retry loop records whether any
+      # attempt succeeded and exits non-zero when all three fail, rather than
+      # finishing on a successful Start-Sleep and reporting the step as green.
       - name: Publish to VSCode Marketplace
         if: github.event_name == 'release'
         working-directory: vscode-extension
         timeout-minutes: 10
-        continue-on-error: true
         shell: pwsh
         run: |
           $vsix = (Get-ChildItem *.vsix | Select-Object -First 1).Name
-          Write-Output "Publishing $vsix..."
+          Write-Output "Publishing $vsix to VS Code Marketplace..."
+          $published = $false
           for ($i = 1; $i -le 3; $i++) {
             Write-Output "Attempt $i/3..."
             vsce publish --packagePath $vsix -p ${{ secrets.VSCE_PAT }}
-            if ($LASTEXITCODE -eq 0) { break }
-            Write-Output "Attempt $i failed, waiting 30s..."
-            Start-Sleep -Seconds 30
+            if ($LASTEXITCODE -eq 0) { $published = $true; break }
+            Write-Output "Attempt $i failed (exit $LASTEXITCODE)."
+            if ($i -lt 3) { Write-Output "Waiting 30s..."; Start-Sleep -Seconds 30 }
+          }
+          if (-not $published) {
+            Write-Error "vsce publish failed after 3 attempts. Is VSCE_PAT valid and unexpired?"
+            exit 1
+          }
+
+      # Open VSX (for Cursor / VSCodium / Windsurf / Eclipse Theia users).
+      # Opt-in: skipped unless an OVSX_PAT secret is configured, so it never
+      # blocks a release before the Open VSX namespace/token is set up.
+      # Prerequisite (one-time): `ovsx create-namespace nash-dir -p <OVSX_PAT>`.
+      - name: Publish to Open VSX
+        if: github.event_name == 'release' && env.OVSX_PAT != ''
+        working-directory: vscode-extension
+        timeout-minutes: 10
+        shell: pwsh
+        run: |
+          npm install -g ovsx
+          $vsix = (Get-ChildItem *.vsix | Select-Object -First 1).Name
+          Write-Output "Publishing $vsix to Open VSX..."
+          $published = $false
+          for ($i = 1; $i -le 3; $i++) {
+            Write-Output "Attempt $i/3..."
+            ovsx publish $vsix -p $env:OVSX_PAT
+            if ($LASTEXITCODE -eq 0) { $published = $true; break }
+            Write-Output "Attempt $i failed (exit $LASTEXITCODE)."
+            if ($i -lt 3) { Write-Output "Waiting 30s..."; Start-Sleep -Seconds 30 }
+          }
+          if (-not $published) {
+            Write-Error "ovsx publish failed after 3 attempts. Is OVSX_PAT valid and the namespace created?"
+            exit 1
           }
 
   # ─────────────────────────────────────────────
@@ -132,6 +185,19 @@ jobs:
     steps:
       - uses: actions/checkout@v4
 
+      - name: Verify version matches release tag
+        if: github.event_name == 'release'  # tag-vs-version check only applies to release runs
+        shell: bash
+        run: |
+          TAG="${GITHUB_REF_NAME}"  # read the tag from the runner env; inlining an expression here would allow script injection
+          VER="${TAG#v}"  # drop a leading "v" so v1.2.3 compares equal to 1.2.3
+          PYP=$(grep -m1 '^version' pyproject.toml | sed -E 's/.*"([^"]+)".*/\1/')  # first quoted value on the first line starting with "version"
+          echo "tag=$TAG  expected=$VER  pyproject=$PYP"
+          if [ "$VER" != "$PYP" ]; then
+            echo "::error::pyproject.toml version ($PYP) does not match release tag ($VER)"
+            exit 1
+          fi
+
       - name: Set up Python
         uses: actions/setup-python@v5
         with:

diff --git a/.gitignore b/.gitignore
@@ -33,12 +33,20 @@ vscode-extension/out/
 vscode-extension/bin/
 vscode-extension/*.vsix
 
-# Example — track only *.py and benchmark/*.md
+# Example — track only *.py (root helpers) and benchmark/*.md
 # Downloaded data is gitignored (re-download via: python example/download_examples.py)
 example/*/
 !example/benchmark/
 example/benchmark/*
 !example/benchmark/*.md
+# Generated CLI report outputs left in example/ root
+# (timestamped, e.g. left_right_20260622_2035.{csv,html,md,pdf,json};
+#  *.sig and manifest.sha256.json are already covered by global rules above)
+example/*.csv
+example/*.html
+example/*.md
+example/*.pdf
+example/*.json
 
 # AI Agent
 .agent

diff --git a/README.md b/README.md
@@ -93,7 +93,7 @@ Diffinite runs a two-stage pipeline:
 ### Stage 1: 1:1 File Matching (`simple` mode)
 
 1. **Fuzzy name matching** — Pairs files across `dir_a` and `dir_b` using [RapidFuzz](https://github.com/rapidfuzz/RapidFuzz) string similarity (configurable threshold).
-2. **Comment stripping** — Optionally removes comments using a 6-state finite state machine parser supporting 30+ file extensions.
+2. **Comment stripping** — Optionally removes comments using a 6-state finite state machine parser supporting 45+ file extensions.
 3. **Side-by-side diff** — Computes line-by-line (or word-by-word) diffs using Python's `difflib.SequenceMatcher` with `autojunk=True`, a heuristic that drops high-frequency lines to speed up matching on large files (`SequenceMatcher` itself remains worst-case quadratic).
 4. **Report generation** — Renders syntax-highlighted HTML diffs via Pygments, then converts to PDF with xhtml2pdf.
 
@@ -181,12 +181,25 @@ dir_b    Path to the comparison source directory (B)
 | `--report-json PATH` | Generate machine-readable JSON report (used by VS Code extension) |
 | `--no-merge` | Generate individual PDFs per file instead of one merged PDF |
 | `--preserve-tree` / `--no-preserve-tree` | Preserve directory tree structure in individual output (default: on) |
+| `--sort-by {filename,path,similarity,ratio}` | Sort matched pairs in the report. Default: insertion order (no sort). |
+| `--sort-order {asc,desc}` | Sort direction (default: `asc`). Only effective with `--sort-by`. |
+
+### PDF Font / CJK Rendering
+
+Korean, Japanese, and Chinese text is rendered correctly in PDF output. By default the built-in xhtml2pdf CJK font (`HYGothic-Medium`) is used as a fallback; for a specific typeface, supply one of the options below. HTML output relies on the browser's native font fallback and needs no configuration.
+
+| Option | Description |
+|--------|-------------|
+| `--pdf-lang {ko,ja,zh-cn,…}` | Auto-resolve the best OS-installed font for the given language from the built-in `pdf_fonts.json` map (Windows/macOS/Linux paths). |
+| `--pdf-font PATH` | Absolute path to a `.ttf`/`.otf` font to embed via `@font-face` as the primary typeface. Takes precedence over `--pdf-lang`. |
+
+> Extend or override language→font mappings by creating `~/.diffinite_fonts.json` (same schema as the built-in map). User entries are merged over the built-ins per language.
 
 ### Diff Options
 
 | Option | Default | Description |
 |--------|:-------:|-------------|
-| `--strip-comments` | off | Strip comments before comparison (6-state FSM parser, 30+ extensions) |
+| `--strip-comments` | off | Strip comments before comparison (6-state FSM parser, 45+ extensions) |
 | `--by-word` | off | Compare by word instead of by line |
 | `--squash-blanks` | off | Collapse runs of 3+ blank lines. ⚠️ Changes line numbers — not recommended for forensic line-tracing. |
 | `--threshold N` | `60` | Fuzzy file-name matching threshold (0–100). Lower = more aggressive matching. |
@@ -213,6 +226,19 @@ dir_b    Path to the comparison source directory (B)
 | `--max-file-size N` | `10.0` | Files larger than this (MB) bypass the in-memory text decode and fall back to a SHA-256 hash comparison (reported as match/no-match rather than a line diff). Prevents OOM/CPU lock on large binary/generated files. |
 | `--hash` | off | Embed SHA-256 evidence integrity hashes for all analyzed files in the report. |
 | `--uncompared-files {inline,separate,none}` | `inline` | Control how unmatched files are displayed: inline in the main report, written to a separate `*_uncompared.txt` file, or omitted. |
+| `--bundle PATH` | — | Create an evidence bundle ZIP at `PATH` containing source files, generated reports, and the integrity manifest. |
+| `--dir-alias-a TEXT` / `--dir-alias-b TEXT` | — | Display alias for directory A/B in reports (avoids exposing absolute paths). |
+
+### Filtering & Advanced Options
+
+| Option | Default | Description |
+|--------|:-------:|-------------|
+| `--ignore-file PATH` | — | Path to a `.diffignore` file with glob patterns (e.g. `node_modules`, `*.pyc`) to exclude from analysis. |
+| `--binary-handling {exclude,hash,error}` | `hash` | How to handle non-decodable (binary) files: skip, show SHA-256 match status, or report a decode error. |
+| `--max-diff-html-size N` | `2.0` | Max HTML diff size (MB) before truncation. Prevents xhtml2pdf OOM/`RecursionError` on huge diffs. |
+| `--metrics-only` | off | Phase 1 only: compute similarities and emit JSON, skipping HTML/PDF rendering. |
+| `--filter-json PATH` | — | Phase 2: restrict output to the file-A paths listed in a JSON array (pairs with `--metrics-only`). |
+| `--unreadable-log PATH` | — | Write the list of files that could not be read (permission errors) to `PATH`. |
 
 ### Page Annotation Options
 
@@ -291,20 +317,20 @@ diffinite dir_a/ dir_b/ --threshold 80
 
 ## Comment Stripping Support
 
-The `--strip-comments` flag removes comments using a 6-state finite state machine parser:
+The `--strip-comments` flag removes comments using a 6-state finite state machine parser covering 45+ file extensions:
 
 | Extensions | Comment Styles |
 |------------|---------------|
-| `.py` | `# line comments` |
-| `.js`, `.ts`, `.jsx`, `.tsx` | `// line`, `/* block */` |
-| `.java`, `.c`, `.cpp`, `.h`, `.cs`, `.go`, `.rs`, `.kt`, `.scala` | `// line`, `/* block */` |
-| `.html`, `.xml`, `.svg`, `.htm` | `<!-- block -->` |
+| `.py`, `.pyw` | `# line` (docstrings preserved) |
+| `.js`, `.jsx`, `.mjs`, `.ts`, `.tsx` | `// line`, `/* block */`, template literals, regex literals |
+| `.java`, `.kt`, `.kts`, `.scala`, `.c`, `.cc`, `.cpp`, `.h`, `.hpp`, `.cs`, `.go`, `.rs`, `.swift` | `// line`, `/* block */` |
+| `.html`, `.htm`, `.xml`, `.svg` | `<!-- block -->` |
 | `.css`, `.scss`, `.less` | `/* block */` |
-| `.sql` | `-- line`, `/* block */` |
-| `.rb` | `# line` |
-| `.sh`, `.bash`, `.zsh` | `# line` |
+| `.sql`, `.ddl`, `.dml`, `.pks`, `.pkb`, `.plsql`, `.tsql` | `-- line`, `/* block */` |
+| `.php` | `// line`, `# line`, `/* block */` |
+| `.rb` | `# line`, `=begin … =end` block |
+| `.pl`, `.pm`, `.sh`, `.bash`, `.zsh`, `.r`, `.yaml`, `.yml`, `.toml` | `# line` |
 | `.lua` | `-- line`, `--[[ block ]]` |
-| `.r` | `# line` |
 
 > String and triple-quoted literals (including Python docstrings), template literals, and regex literals are deliberately **preserved**, not stripped — they are recognized only so that comment markers appearing inside them (e.g. `//` inside a string) are not mistaken for comments.
 
@@ -325,7 +351,7 @@ diffinite/
 │   ├── evidence.py         # SHA-256 integrity hashing & manifest generation
 │   ├── models.py           # Data classes (DiffResult, DeepMatchResult, etc.)
 │   ├── pdf_gen.py          # PDF/HTML report generation (xhtml2pdf)
-│   └── languages/          # Per-language comment specs (30+ extensions)
+│   └── languages/          # Per-language comment specs (45+ extensions)
 ├── vscode-extension/
 │   ├── src/                # TypeScript extension source
 │   │   ├── extension.ts    # Extension activation & command registration

diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "diffinite"
-version = "0.12.1"
+version = "0.12.2"
 description = "Forensic source-code comparison tool — Winnowing fingerprints and professional PDF reports for IP litigation & code audit"
 readme = "README.md"
 license = {text = "Apache-2.0"}

diff --git a/src/diffinite/pdf_gen.py b/src/diffinite/pdf_gen.py
@@ -721,7 +721,7 @@ def build_diff_page_html(
         body = (
             f"<h2>{index}. {html.escape(r.match.rel_path_a)} &harr; "
             f"{html.escape(r.match.rel_path_b)}</h2>\n"
-            f"<p>Match ratio: {_ratio_badge(r.ratio)} &nbsp; "
+            f"<p>Content match: {_ratio_badge(r.ratio)} &nbsp; "
             f"<span style='color:green'>+{r.additions} {unit}(s)</span> &nbsp; "
             f"<span style='color:red'>-{r.deletions} {unit}(s)</span></p>\n"
             f"{r.html_diff}\n"
@@ -821,7 +821,10 @@ def merge_with_bookmarks(
         layout_data.append({
             'file_a': result.match.rel_path_a,
             'file_b': result.match.rel_path_b,
-            'sim': result.match.similarity,
+            'sim': result.match.similarity,   # filename (name) similarity, 0–100
+            'ratio': result.ratio,            # content match (difflib), 0.0–1.0
+            'binary': result.binary,
+            'hash_match': result.hash_match,
             'start_page': page_offset + 1,
             'end_page': page_offset + page_count
         })

diff --git a/src/diffinite/pipeline.py b/src/diffinite/pipeline.py
@@ -312,8 +312,8 @@ def _generate_markdown_report(
 
     # Summary table
     lines.append("## Summary\n")
-    lines.append("| # | File A | File B | Name Sim. | Match | +Added | −Deleted |")
-    lines.append("|---|--------|--------|:---------:|:-----:|:------:|:--------:|")
+    lines.append("| # | File A | File B | Name Sim. | Content Match | +Added | −Deleted |")
+    lines.append("|---|--------|--------|:---------:|:-------------:|:------:|:--------:|")
     for idx, r in enumerate(results, 1):
         if r.binary:
             status = "✓ Match" if r.hash_match else "✗ Mismatch"
@@ -500,7 +500,7 @@ def _generate_html_report(
             diff_sections.append(
                 f'<h2>{idx}. {html_mod.escape(r.match.rel_path_a)} &harr; '
                 f'{html_mod.escape(r.match.rel_path_b)}</h2>\n'
-                f'<p>Match ratio: {_ratio_badge(r.ratio)} &nbsp; '
+                f'<p>Content match: {_ratio_badge(r.ratio)} &nbsp; '
                 f'<span style="color:green">+{r.additions} {unit}(s)</span> &nbsp; '
                 f'<span style="color:red">-{r.deletions} {unit}(s)</span></p>\n'
                 f'{r.html_diff}\n'
@@ -583,7 +583,7 @@ def _generate_individual_html(
             body = (
                 f'<h2>{html_mod.escape(r.match.rel_path_a)} &harr; '
                 f'{html_mod.escape(r.match.rel_path_b)}</h2>\n'
-                f'<p>Match ratio: {_ratio_badge(r.ratio)} &nbsp; '
+                f'<p>Content match: {_ratio_badge(r.ratio)} &nbsp; '
                 f'<span style="color:green">+{r.additions} {unit}(s)</span> &nbsp; '
                 f'<span style="color:red">-{r.deletions} {unit}(s)</span></p>\n'
                 f'{r.html_diff}\n'
@@ -612,7 +612,10 @@ def _generate_individual_html(
             "idx": idx,
             "file_a": r.match.rel_path_a,
             "file_b": r.match.rel_path_b,
-            "ratio": r.ratio,
+            "name_similarity": r.match.similarity,  # fuzzy filename similarity, 0–100
+            "ratio": r.ratio,                       # content match (difflib), 0.0–1.0
+            "binary": r.binary,
+            "hash_match": r.hash_match,
             "additions": r.additions,
             "deletions": r.deletions,
             "link": rel_link,
@@ -643,16 +646,24 @@ def _build_index_html(
     """Generate an index.html with hyperlinks to all individual reports."""
     rows = []
     for e in entries:
-        ratio_pct = f"{e['ratio'] * 100:.1f}%"
+        name_sim = f"{e.get('name_similarity', 0.0):.1f}"
+        # "Content Match" mirrors the cover/CSV: percentage for text pairs,
+        # SHA-256 status for binary pairs (difflib ratio is meaningless there).
         if e["error"]:
-            ratio_pct = f'<span style="color:red">Error</span>'
+            content_match = '<span style="color:red">Error</span>'
+        elif e.get("binary"):
+            hm = e.get("hash_match")
+            content_match = "Binary match" if hm else ("Binary mismatch" if hm is False else "Binary")
+        else:
+            content_match = f"{e['ratio'] * 100:.1f}%"
         rows.append(
             f'<tr>'
             f'<td style="text-align:center">{e["idx"]}</td>'
             f'<td><a href="{html_mod.escape(e["link"])}">'
             f'{html_mod.escape(e["file_a"])}</a></td>'
             f'<td>{html_mod.escape(e["file_b"])}</td>'
-            f'<td style="text-align:center">{ratio_pct}</td>'
+            f'<td style="text-align:center">{name_sim}</td>'
+            f'<td style="text-align:center">{content_match}</td>'
             f'<td style="text-align:center;color:green">+{e["additions"]}</td>'
             f'<td style="text-align:center;color:red">-{e["deletions"]}</td>'
             f'</tr>\n'
@@ -699,7 +710,7 @@ def _build_index_html(
 <thead>
 <tr>
 <th>#</th><th>File A (→ Click to View)</th><th>File B</th>
-<th>Similarity</th><th>Added</th><th>Deleted</th>
+<th>Name Sim.</th><th>Content Match</th><th>Added</th><th>Deleted</th>
 </tr>
 </thead>
 <tbody>
@@ -1314,8 +1325,13 @@ def _generate_pdf_report(
                 with open(csv_path, 'w', newline='', encoding='utf-8-sig') as f:
                     writer = csv.writer(f)
                     col_header = "Bates Range" if show_bates_number else "Page Range"
-                    writer.writerow(["Index", "File A", "File B", "Similarity (%)", col_header])
-
+                    # "Name Sim." = fuzzy filename similarity; "Content Match" = difflib
+                    # content ratio. Keeping them in separate, explicitly-labelled columns
+                    # prevents reading a 100% filename match as identical file content.
+                    writer.writerow(
+                        ["Index", "File A", "File B", "Name Sim. (%)", "Content Match (%)", col_header]
+                    )
+
                     for row_idx, row in enumerate(exhibit_data, 1):
                         start_page = row['start_page']
                         end_page = row['end_page']
@@ -1328,8 +1344,17 @@ def _generate_pdf_report(
                         else:
                             range_str = str(start_page) if start_page == end_page else f"{start_page} - {end_page}"
 
+                        # Content match: percentage for text pairs; for binary pairs
+                        # difflib ratio is meaningless, so report SHA-256 match status.
+                        if row.get('binary'):
+                            hm = row.get('hash_match')
+                            content_str = "Binary match" if hm else ("Binary mismatch" if hm is False else "Binary")
+                        else:
+                            content_str = f"{row.get('ratio', 0.0) * 100:.1f}%"
+
                         writer.writerow([
-                            row_idx, row['file_a'], row['file_b'], f"{row['sim']:.1f}%", range_str
+                            row_idx, row['file_a'], row['file_b'],
+                            f"{row['sim']:.1f}%", content_str, range_str
                         ])
                 logger.info("  Exhibit Index CSV generated → %s", csv_path)
             except Exception as e: