Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 71 additions & 5 deletions .github/workflows/release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,10 @@ jobs:
# ─────────────────────────────────────────────
publish-vsce:
name: Publish VSIX (${{ matrix.target }})
# Job-level so the Open VSX step's `if` can read it — a step's own step-level
# env is NOT visible to that same step's `if` condition.
env:
OVSX_PAT: ${{ secrets.OVSX_PAT }}
strategy:
fail-fast: false
matrix:
Expand All @@ -46,6 +50,21 @@ jobs:
with:
node-version: "20"

# Fail fast if the release tag and the packaged version disagree, so we
# never publish a VSIX whose version differs from the tag people see.
- name: Verify version matches release tag
  if: github.event_name == 'release'
  shell: bash
  # Hand the tag to the script through the environment instead of expanding
  # the expression inside the script text. An inline expression is pasted
  # into the script before bash parses it, so a tag containing quotes,
  # backticks or $(...) would be executed as shell code.
  env:
    RELEASE_TAG: ${{ github.event.release.tag_name }}
  run: |
    TAG="$RELEASE_TAG"
    # Drop one optional leading "v" so tag v1.2.3 is compared as 1.2.3.
    VER="${TAG#v}"
    PKG=$(node -p "require('./vscode-extension/package.json').version")
    echo "tag=$TAG expected=$VER package.json=$PKG"
    if [ "$VER" != "$PKG" ]; then
      # ::error:: surfaces the mismatch as an annotation on the workflow run.
      echo "::error::package.json version ($PKG) does not match release tag ($VER)"
      exit 1
    fi

- name: Install Extension dependencies
working-directory: vscode-extension
run: npm install
Expand Down Expand Up @@ -99,21 +118,55 @@ jobs:
with:
files: vscode-extension/*.vsix

# No continue-on-error: a failed Marketplace publish (e.g. an expired
# VSCE_PAT) MUST turn the job red. The retry loop now records success and
# exits non-zero when all attempts fail, instead of silently ending on the
# trailing Start-Sleep.
- name: Publish to VSCode Marketplace
  if: github.event_name == 'release'
  working-directory: vscode-extension
  timeout-minutes: 10
  shell: pwsh
  # The token reaches the script as an environment variable rather than
  # being expanded into the script text. There is deliberately no
  # continue-on-error here: a failed publish must fail the job.
  env:
    VSCE_PAT: ${{ secrets.VSCE_PAT }}
  run: |
    # Publish the first .vsix found in the working directory.
    $vsix = (Get-ChildItem *.vsix | Select-Object -First 1).Name
    Write-Output "Publishing $vsix to VS Code Marketplace..."
    # Track success explicitly; leaving the loop is not proof of a publish.
    $published = $false
    for ($i = 1; $i -le 3; $i++) {
      Write-Output "Attempt $i/3..."
      vsce publish --packagePath $vsix -p $env:VSCE_PAT
      if ($LASTEXITCODE -eq 0) { $published = $true; break }
      Write-Output "Attempt $i failed (exit $LASTEXITCODE)."
      # Only wait when another attempt follows.
      if ($i -lt 3) { Write-Output "Waiting 30s..."; Start-Sleep -Seconds 30 }
    }
    if (-not $published) {
      Write-Error "vsce publish failed after 3 attempts. Is VSCE_PAT valid and unexpired?"
      exit 1
    }

# Open VSX (for Cursor / VSCodium / Windsurf / Eclipse Theia users).
# Opt-in: skipped unless an OVSX_PAT secret is configured, so it never
# blocks a release before the Open VSX namespace/token is set up.
# Prerequisite (one-time): `ovsx create-namespace nash-dir -p <OVSX_PAT>`.
- name: Publish to Open VSX
  if: github.event_name == 'release' && env.OVSX_PAT != ''
  working-directory: vscode-extension
  timeout-minutes: 10
  shell: pwsh
  run: |
    # The ovsx CLI is installed globally at run time.
    npm install -g ovsx
    # Publish the first .vsix found in the working directory.
    $vsix = (Get-ChildItem *.vsix | Select-Object -First 1).Name
    Write-Output "Publishing $vsix to Open VSX..."
    # Up to three attempts; the flag records whether any of them exited 0.
    $succeeded = $false
    foreach ($i in 1..3) {
      Write-Output "Attempt $i/3..."
      ovsx publish $vsix -p $env:OVSX_PAT
      if ($LASTEXITCODE -eq 0) {
        $succeeded = $true
        break
      }
      Write-Output "Attempt $i failed (exit $LASTEXITCODE)."
      # Only wait when another attempt follows.
      if ($i -lt 3) {
        Write-Output "Waiting 30s..."
        Start-Sleep -Seconds 30
      }
    }
    if (-not $succeeded) {
      Write-Error "ovsx publish failed after 3 attempts. Is OVSX_PAT valid and the namespace created?"
      exit 1
    }

# ─────────────────────────────────────────────
Expand All @@ -132,6 +185,19 @@ jobs:
steps:
- uses: actions/checkout@v4

- name: Verify version matches release tag
  if: github.event_name == 'release'
  shell: bash
  # Hand the tag to the script through the environment instead of expanding
  # the expression inside the script text. An inline expression is pasted
  # into the script before bash parses it, so a tag containing quotes,
  # backticks or $(...) would be executed as shell code.
  env:
    RELEASE_TAG: ${{ github.event.release.tag_name }}
  run: |
    TAG="$RELEASE_TAG"
    # Drop one optional leading "v" so tag v1.2.3 is compared as 1.2.3.
    VER="${TAG#v}"
    # Take the first line starting with "version" and extract its quoted value.
    PYP=$(grep -m1 '^version' pyproject.toml | sed -E 's/.*"([^"]+)".*/\1/')
    echo "tag=$TAG expected=$VER pyproject=$PYP"
    if [ "$VER" != "$PYP" ]; then
      # ::error:: surfaces the mismatch as an annotation on the workflow run.
      echo "::error::pyproject.toml version ($PYP) does not match release tag ($VER)"
      exit 1
    fi

- name: Set up Python
uses: actions/setup-python@v5
with:
Expand Down
10 changes: 9 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -33,12 +33,20 @@ vscode-extension/out/
vscode-extension/bin/
vscode-extension/*.vsix

# Example — track only *.py and benchmark/*.md
# Example — track only *.py (root helpers) and benchmark/*.md
# Downloaded data is gitignored (re-download via: python example/download_examples.py)
example/*/
!example/benchmark/
example/benchmark/*
!example/benchmark/*.md
# Generated CLI report outputs left in example/ root
# (timestamped, e.g. left_right_20260622_2035.{csv,html,md,pdf,json};
# *.sig and manifest.sha256.json are already covered by global rules above)
example/*.csv
example/*.html
example/*.md
example/*.pdf
example/*.json

# AI Agent
.agent
Expand Down
50 changes: 38 additions & 12 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ Diffinite runs a two-stage pipeline:
### Stage 1: 1:1 File Matching (`simple` mode)

1. **Fuzzy name matching** — Pairs files across `dir_a` and `dir_b` using [RapidFuzz](https://github.com/rapidfuzz/RapidFuzz) string similarity (configurable threshold).
2. **Comment stripping** — Optionally removes comments using a 6-state finite state machine parser supporting 30+ file extensions.
2. **Comment stripping** — Optionally removes comments using a 6-state finite state machine parser supporting 45+ file extensions.
3. **Side-by-side diff** — Computes line-by-line (or word-by-word) diffs using Python's `difflib.SequenceMatcher` with `autojunk=True`, a heuristic that drops high-frequency lines to speed up matching on large files (`SequenceMatcher` itself remains worst-case quadratic).
4. **Report generation** — Renders syntax-highlighted HTML diffs via Pygments, then converts to PDF with xhtml2pdf.

Expand Down Expand Up @@ -181,12 +181,25 @@ dir_b Path to the comparison source directory (B)
| `--report-json PATH` | Generate machine-readable JSON report (used by VS Code extension) |
| `--no-merge` | Generate individual PDFs per file instead of one merged PDF |
| `--preserve-tree` / `--no-preserve-tree` | Preserve directory tree structure in individual output (default: on) |
| `--sort-by {filename,path,similarity,ratio}` | Sort matched pairs in the report. Default: insertion order (no sort). |
| `--sort-order {asc,desc}` | Sort direction (default: `asc`). Only effective with `--sort-by`. |

### PDF Font / CJK Rendering

Korean, Japanese, and Chinese text is rendered correctly in PDF output. By default the built-in xhtml2pdf CJK font (`HYGothic-Medium`) is used as a fallback; for a specific typeface, supply one of the options below. HTML output relies on the browser's native font fallback and needs no configuration.

| Option | Description |
|--------|-------------|
| `--pdf-lang {ko,ja,zh-cn,…}` | Auto-resolve the best OS-installed font for the given language from the built-in `pdf_fonts.json` map (Windows/macOS/Linux paths). |
| `--pdf-font PATH` | Absolute path to a `.ttf`/`.otf` font to embed via `@font-face` as the primary typeface. Takes precedence over `--pdf-lang`. |

> Extend or override language→font mappings by creating `~/.diffinite_fonts.json` (same schema as the built-in map). User entries are merged over the built-ins per language.

### Diff Options

| Option | Default | Description |
|--------|:-------:|-------------|
| `--strip-comments` | off | Strip comments before comparison (6-state FSM parser, 30+ extensions) |
| `--strip-comments` | off | Strip comments before comparison (6-state FSM parser, 45+ extensions) |
| `--by-word` | off | Compare by word instead of by line |
| `--squash-blanks` | off | Collapse runs of 3+ blank lines. ⚠️ Changes line numbers — not recommended for forensic line-tracing. |
| `--threshold N` | `60` | Fuzzy file-name matching threshold (0–100). Lower = more aggressive matching. |
Expand All @@ -213,6 +226,19 @@ dir_b Path to the comparison source directory (B)
| `--max-file-size N` | `10.0` | Files larger than this (MB) bypass the in-memory text decode and fall back to a SHA-256 hash comparison (reported as match/no-match rather than a line diff). Prevents OOM/CPU lock on large binary/generated files. |
| `--hash` | off | Embed SHA-256 evidence integrity hashes for all analyzed files in the report. |
| `--uncompared-files {inline,separate,none}` | `inline` | Control how unmatched files are displayed: inline in the main report, written to a separate `*_uncompared.txt` file, or omitted. |
| `--bundle PATH` | — | Create an evidence bundle ZIP at `PATH` containing source files, generated reports, and the integrity manifest. |
| `--dir-alias-a TEXT` / `--dir-alias-b TEXT` | — | Display alias for directory A/B in reports (avoids exposing absolute paths). |

### Filtering & Advanced Options

| Option | Default | Description |
|--------|:-------:|-------------|
| `--ignore-file PATH` | — | Path to a `.diffignore` file with glob patterns (e.g. `node_modules`, `*.pyc`) to exclude from analysis. |
| `--binary-handling {exclude,hash,error}` | `hash` | How to handle non-decodable (binary) files: skip, show SHA-256 match status, or report a decode error. |
| `--max-diff-html-size N` | `2.0` | Max HTML diff size (MB) before truncation. Prevents xhtml2pdf OOM/`RecursionError` on huge diffs. |
| `--metrics-only` | off | Phase 1 only: compute similarities and emit JSON, skipping HTML/PDF rendering. |
| `--filter-json PATH` | — | Phase 2: restrict output to the file-A paths listed in a JSON array (pairs with `--metrics-only`). |
| `--unreadable-log PATH` | — | Write the list of files that could not be read (permission errors) to `PATH`. |

### Page Annotation Options

Expand Down Expand Up @@ -291,20 +317,20 @@ diffinite dir_a/ dir_b/ --threshold 80

## Comment Stripping Support

The `--strip-comments` flag removes comments using a 6-state finite state machine parser:
The `--strip-comments` flag removes comments using a 6-state finite state machine parser covering 45+ file extensions:

| Extensions | Comment Styles |
|------------|---------------|
| `.py` | `# line comments` |
| `.js`, `.ts`, `.jsx`, `.tsx` | `// line`, `/* block */` |
| `.java`, `.c`, `.cpp`, `.h`, `.cs`, `.go`, `.rs`, `.kt`, `.scala` | `// line`, `/* block */` |
| `.html`, `.xml`, `.svg`, `.htm` | `<!-- block -->` |
| `.py`, `.pyw` | `# line` (docstrings preserved) |
| `.js`, `.jsx`, `.mjs`, `.ts`, `.tsx` | `// line`, `/* block */`, template literals, regex literals |
| `.java`, `.kt`, `.kts`, `.scala`, `.c`, `.cc`, `.cpp`, `.h`, `.hpp`, `.cs`, `.go`, `.rs`, `.swift` | `// line`, `/* block */` |
| `.html`, `.htm`, `.xml`, `.svg` | `<!-- block -->` |
| `.css`, `.scss`, `.less` | `/* block */` |
| `.sql` | `-- line`, `/* block */` |
| `.rb` | `# line` |
| `.sh`, `.bash`, `.zsh` | `# line` |
| `.sql`, `.ddl`, `.dml`, `.pks`, `.pkb`, `.plsql`, `.tsql` | `-- line`, `/* block */` |
| `.php` | `// line`, `# line`, `/* block */` |
| `.rb` | `# line`, `=begin … =end` block |
| `.pl`, `.pm`, `.sh`, `.bash`, `.zsh`, `.r`, `.yaml`, `.yml`, `.toml` | `# line` |
| `.lua` | `-- line`, `--[[ block ]]` |
| `.r` | `# line` |

> String and triple-quoted literals (including Python docstrings), template literals, and regex literals are deliberately **preserved**, not stripped — they are recognized only so that comment markers appearing inside them (e.g. `//` inside a string) are not mistaken for comments.

Expand All @@ -325,7 +351,7 @@ diffinite/
│ ├── evidence.py # SHA-256 integrity hashing & manifest generation
│ ├── models.py # Data classes (DiffResult, DeepMatchResult, etc.)
│ ├── pdf_gen.py # PDF/HTML report generation (xhtml2pdf)
│ └── languages/ # Per-language comment specs (30+ extensions)
│ └── languages/ # Per-language comment specs (45+ extensions)
├── vscode-extension/
│ ├── src/ # TypeScript extension source
│ │ ├── extension.ts # Extension activation & command registration
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "diffinite"
version = "0.12.1"
version = "0.12.2"
description = "Forensic source-code comparison tool — Winnowing fingerprints and professional PDF reports for IP litigation & code audit"
readme = "README.md"
license = {text = "Apache-2.0"}
Expand Down
7 changes: 5 additions & 2 deletions src/diffinite/pdf_gen.py
Original file line number Diff line number Diff line change
Expand Up @@ -721,7 +721,7 @@ def build_diff_page_html(
body = (
f"<h2>{index}. {html.escape(r.match.rel_path_a)} &harr; "
f"{html.escape(r.match.rel_path_b)}</h2>\n"
f"<p>Match ratio: {_ratio_badge(r.ratio)} &nbsp; "
f"<p>Content match: {_ratio_badge(r.ratio)} &nbsp; "
f"<span style='color:green'>+{r.additions} {unit}(s)</span> &nbsp; "
f"<span style='color:red'>-{r.deletions} {unit}(s)</span></p>\n"
f"{r.html_diff}\n"
Expand Down Expand Up @@ -821,7 +821,10 @@ def merge_with_bookmarks(
layout_data.append({
'file_a': result.match.rel_path_a,
'file_b': result.match.rel_path_b,
'sim': result.match.similarity,
'sim': result.match.similarity, # filename (name) similarity, 0–100
'ratio': result.ratio, # content match (difflib), 0.0–1.0
'binary': result.binary,
'hash_match': result.hash_match,
'start_page': page_offset + 1,
'end_page': page_offset + page_count
})
Expand Down
49 changes: 37 additions & 12 deletions src/diffinite/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -312,8 +312,8 @@ def _generate_markdown_report(

# Summary table
lines.append("## Summary\n")
lines.append("| # | File A | File B | Name Sim. | Match | +Added | −Deleted |")
lines.append("|---|--------|--------|:---------:|:-----:|:------:|:--------:|")
lines.append("| # | File A | File B | Name Sim. | Content Match | +Added | −Deleted |")
lines.append("|---|--------|--------|:---------:|:-------------:|:------:|:--------:|")
for idx, r in enumerate(results, 1):
if r.binary:
status = "✓ Match" if r.hash_match else "✗ Mismatch"
Expand Down Expand Up @@ -500,7 +500,7 @@ def _generate_html_report(
diff_sections.append(
f'<h2>{idx}. {html_mod.escape(r.match.rel_path_a)} &harr; '
f'{html_mod.escape(r.match.rel_path_b)}</h2>\n'
f'<p>Match ratio: {_ratio_badge(r.ratio)} &nbsp; '
f'<p>Content match: {_ratio_badge(r.ratio)} &nbsp; '
f'<span style="color:green">+{r.additions} {unit}(s)</span> &nbsp; '
f'<span style="color:red">-{r.deletions} {unit}(s)</span></p>\n'
f'{r.html_diff}\n'
Expand Down Expand Up @@ -583,7 +583,7 @@ def _generate_individual_html(
body = (
f'<h2>{html_mod.escape(r.match.rel_path_a)} &harr; '
f'{html_mod.escape(r.match.rel_path_b)}</h2>\n'
f'<p>Match ratio: {_ratio_badge(r.ratio)} &nbsp; '
f'<p>Content match: {_ratio_badge(r.ratio)} &nbsp; '
f'<span style="color:green">+{r.additions} {unit}(s)</span> &nbsp; '
f'<span style="color:red">-{r.deletions} {unit}(s)</span></p>\n'
f'{r.html_diff}\n'
Expand Down Expand Up @@ -612,7 +612,10 @@ def _generate_individual_html(
"idx": idx,
"file_a": r.match.rel_path_a,
"file_b": r.match.rel_path_b,
"ratio": r.ratio,
"name_similarity": r.match.similarity, # fuzzy filename similarity, 0–100
"ratio": r.ratio, # content match (difflib), 0.0–1.0
"binary": r.binary,
"hash_match": r.hash_match,
"additions": r.additions,
"deletions": r.deletions,
"link": rel_link,
Expand Down Expand Up @@ -643,16 +646,24 @@ def _build_index_html(
"""Generate an index.html with hyperlinks to all individual reports."""
rows = []
for e in entries:
ratio_pct = f"{e['ratio'] * 100:.1f}%"
name_sim = f"{e.get('name_similarity', 0.0):.1f}"
# "Content Match" mirrors the cover/CSV: percentage for text pairs,
# SHA-256 status for binary pairs (difflib ratio is meaningless there).
if e["error"]:
ratio_pct = f'<span style="color:red">Error</span>'
content_match = '<span style="color:red">Error</span>'
elif e.get("binary"):
hm = e.get("hash_match")
content_match = "Binary match" if hm else ("Binary mismatch" if hm is False else "Binary")
else:
content_match = f"{e['ratio'] * 100:.1f}%"
rows.append(
f'<tr>'
f'<td style="text-align:center">{e["idx"]}</td>'
f'<td><a href="{html_mod.escape(e["link"])}">'
f'{html_mod.escape(e["file_a"])}</a></td>'
f'<td>{html_mod.escape(e["file_b"])}</td>'
f'<td style="text-align:center">{ratio_pct}</td>'
f'<td style="text-align:center">{name_sim}</td>'
f'<td style="text-align:center">{content_match}</td>'
f'<td style="text-align:center;color:green">+{e["additions"]}</td>'
f'<td style="text-align:center;color:red">-{e["deletions"]}</td>'
f'</tr>\n'
Expand Down Expand Up @@ -699,7 +710,7 @@ def _build_index_html(
<thead>
<tr>
<th>#</th><th>File A (→ Click to View)</th><th>File B</th>
<th>Similarity</th><th>Added</th><th>Deleted</th>
<th>Name Sim.</th><th>Content Match</th><th>Added</th><th>Deleted</th>
</tr>
</thead>
<tbody>
Expand Down Expand Up @@ -1314,8 +1325,13 @@ def _generate_pdf_report(
with open(csv_path, 'w', newline='', encoding='utf-8-sig') as f:
writer = csv.writer(f)
col_header = "Bates Range" if show_bates_number else "Page Range"
writer.writerow(["Index", "File A", "File B", "Similarity (%)", col_header])

# "Name Sim." = fuzzy filename similarity; "Content Match" = difflib
# content ratio. Keeping them in separate, explicitly-labelled columns
# prevents reading a 100% filename match as identical file content.
writer.writerow(
["Index", "File A", "File B", "Name Sim. (%)", "Content Match (%)", col_header]
)

for row_idx, row in enumerate(exhibit_data, 1):
start_page = row['start_page']
end_page = row['end_page']
Expand All @@ -1328,8 +1344,17 @@ def _generate_pdf_report(
else:
range_str = str(start_page) if start_page == end_page else f"{start_page} - {end_page}"

# Content match: percentage for text pairs; for binary pairs
# difflib ratio is meaningless, so report SHA-256 match status.
if row.get('binary'):
hm = row.get('hash_match')
content_str = "Binary match" if hm else ("Binary mismatch" if hm is False else "Binary")
else:
content_str = f"{row.get('ratio', 0.0) * 100:.1f}%"

writer.writerow([
row_idx, row['file_a'], row['file_b'], f"{row['sim']:.1f}%", range_str
row_idx, row['file_a'], row['file_b'],
f"{row['sim']:.1f}%", content_str, range_str
])
logger.info(" Exhibit Index CSV generated → %s", csv_path)
except Exception as e:
Expand Down
Loading
Loading