Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
185 changes: 162 additions & 23 deletions aymurai/text/anonymization/pdf/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,104 @@ def _image_rects_for_clip(
return rects


def _squared_distance_between_rect_centers(
    left: pymupdf.Rect,
    right: pymupdf.Rect,
) -> float:
    """
    Returns the squared Euclidean distance between two rectangle centers.

    The square root is intentionally not taken, so the value is suitable for
    comparing distances against each other rather than as an actual length.

    Args:
        left (pymupdf.Rect): The first rectangle (read via x0, y0, x1, y1).
        right (pymupdf.Rect): The second rectangle (read via x0, y0, x1, y1).

    Returns:
        float: The squared distance between the two center points.
    """
    # Each center coordinate is the midpoint of the matching rectangle edges.
    delta_x = (left.x0 + left.x1) / 2.0 - (right.x0 + right.x1) / 2.0
    delta_y = (left.y0 + left.y1) / 2.0 - (right.y0 + right.y1) / 2.0
    return delta_x**2 + delta_y**2


def _refine_signature_text_rect(
    page: pymupdf.Page,
    entity_text: str,
    widget_rect: pymupdf.Rect,
    current_rect: pymupdf.Rect,
) -> pymupdf.Rect:
    """
    Narrows an entity rectangle to the matching text found inside a signature widget.

    The page is searched for entity_text within the widget area. Among the
    matches, those that intersect current_rect are preferred; if none do, every
    match is considered. The match whose center is closest to the center of
    current_rect is returned.

    Args:
        page (pymupdf.Page): The PDF page being processed.
        entity_text (str): The entity text to look for on the page.
        widget_rect (pymupdf.Rect): The signature widget rectangle used as search clip.
        current_rect (pymupdf.Rect): The rectangle currently assigned to the entity.

    Returns:
        pymupdf.Rect: A new rectangle for the best match, or a copy of
            current_rect when the text is not found inside the widget.
    """
    clip_area = pymupdf.Rect(widget_rect)

    # Keep only matches that really touch the widget area.
    matches = []
    for raw_hit in page.search_for(entity_text, clip=clip_area):
        match_rect = pymupdf.Rect(raw_hit)
        if match_rect.intersects(clip_area):
            matches.append(match_rect)

    anchor = pymupdf.Rect(current_rect)
    if not matches:
        return anchor

    # Matches overlapping the current rectangle win over unrelated
    # occurrences of the same text elsewhere in the widget.
    overlapping = [match for match in matches if match.intersects(anchor)]
    pool = overlapping if overlapping else matches

    nearest = min(
        pool,
        key=lambda match: _squared_distance_between_rect_centers(match, anchor),
    )
    return pymupdf.Rect(nearest)


def _build_signature_page_op(
    page: pymupdf.Page,
    entity_text: str,
    widget_info: dict[str, Any],
    current_rect: pymupdf.Rect,
    token: str,
    entity_style: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """
    Creates a replacement operation for sensitive text inside a signature widget.

    The entity rectangle is first refined against the widget area, then a page
    operation is built for it (with no line rectangle) and tagged with the
    widget's xref and rectangle.

    Args:
        page (pymupdf.Page): The PDF page being processed.
        entity_text (str): The sensitive text being replaced.
        widget_info (dict[str, Any]): The signature widget metadata; must contain
            "rect" and "xref", and may contain "style".
        current_rect (pymupdf.Rect): The initially resolved text rectangle.
        token (str): The logical replacement token.
        entity_style (dict[str, Any] | None): The text style to render with.
            Falls back to the widget style when empty or missing.

    Returns:
        dict[str, Any]: The signature replacement operation.
    """
    text_rect = _refine_signature_text_rect(
        page,
        entity_text,
        widget_info["rect"],
        current_rect,
    )

    # Prefer the entity's own style, then the widget's; empty values become None.
    resolved_style = entity_style or widget_info.get("style") or None

    signature_op = _build_page_op(
        text_rect,
        None,
        token,
        entity_style=resolved_style,
    )
    signature_op["widget_xref"] = widget_info["xref"]
    signature_op["widget_rect"] = widget_info["rect"]
    return signature_op


def _entity_overlaps_image(
page: pymupdf.Page,
entity_rect: pymupdf.Rect,
Expand Down Expand Up @@ -303,14 +401,14 @@ def _collect_page_redactions(
fallback_widget["field_type"]
== pymupdf.PDF_WIDGET_TYPE_SIGNATURE
):
op = _build_page_op(
op = _build_signature_page_op(
page,
entity_text,
fallback_widget,
fallback_rects[0],
lines[0] if lines else None,
token,
entity_style=fallback_widget.get("style") or None,
)
op["widget_xref"] = fallback_widget["xref"]
op["widget_rect"] = fallback_widget["rect"]
signature_widget_ops.setdefault(page_index, []).append(op)
continue

Expand Down Expand Up @@ -470,14 +568,14 @@ def _collect_page_redactions(
)
continue
if widget_info["field_type"] == pymupdf.PDF_WIDGET_TYPE_SIGNATURE:
op = _build_page_op(
op = _build_signature_page_op(
page,
entity_text,
widget_info,
rect,
line,
token,
entity_style=ent_style,
)
op["widget_xref"] = widget_info["xref"]
op["widget_rect"] = widget_info["rect"]
signature_widget_ops.setdefault(page_index, []).append(op)
continue

Expand Down Expand Up @@ -514,43 +612,51 @@ def _collect_page_redactions(

for seg_idx, (
seg_line,
_seg_text,
seg_text,
seg_rect,
seg_img,
seg_style,
seg_widget,
) in enumerate(segments):
if signature_widget is not None:
op = _build_signature_page_op(
page,
seg_text,
signature_widget,
seg_rect,
token,
entity_style=seg_style,
)
if seg_idx != widest_idx:
op["text"] = None
op["fontsize"] = None
signature_widget_ops.setdefault(page_index, []).append(op)
continue

if seg_idx == widest_idx:
op = _build_page_op(
seg_rect,
seg_line,
token,
is_image=(any_image and signature_widget is None),
is_image=any_image,
entity_style=seg_style,
)
if signature_widget is None and shared_image_rect is not None:
if shared_image_rect is not None:
op["image_rect"] = shared_image_rect
else:
op = _build_page_op(
seg_rect,
seg_line,
token,
is_image=(
(seg_img is not None) and signature_widget is None
),
is_image=(seg_img is not None),
entity_style=seg_style,
)
op["text"] = None
op["fontsize"] = None
if seg_img is not None and signature_widget is None:
if seg_img is not None:
op["image_rect"] = seg_img

if signature_widget is not None:
op["widget_xref"] = signature_widget["xref"]
op["widget_rect"] = signature_widget["rect"]
signature_widget_ops.setdefault(page_index, []).append(op)
else:
page_ops.setdefault(page_index, []).append(op)
page_ops.setdefault(page_index, []).append(op)

return page_ops, widget_ops, signature_widget_ops

Expand Down Expand Up @@ -802,6 +908,40 @@ def _apply_asset_redactions(
_render_text_op(page, op)


def _apply_signature_redactions(
    doc: pymupdf.Document,
    signature_widget_ops: dict[int, list[dict]],
) -> None:
    """
    Redacts signer-name rectangles page by page and renders their replacements.

    For every page with operations, a redaction annotation is added for each
    operation's "redact_rect", the redactions are applied once for the page,
    and then each operation is passed to the text renderer.

    Args:
        doc (pymupdf.Document): The PDF document being processed.
        signature_widget_ops (dict[int, list[dict]]): The signature operations
            grouped by page index. Each operation must contain "redact_rect".
    """
    for page_number, page_ops in signature_widget_ops.items():
        if not page_ops:
            continue

        page = doc[page_number]

        # Queue one white-filled redaction per operation before applying them.
        for signature_op in page_ops:
            page.add_redact_annot(
                signature_op["redact_rect"],
                text=None,
                fill=(1, 1, 1),
                cross_out=False,
            )

        # Apply all queued redactions for this page in a single pass.
        redaction_modes = {
            "images": pymupdf.PDF_REDACT_IMAGE_PIXELS,
            "graphics": pymupdf.PDF_REDACT_LINE_ART_NONE,
            "text": pymupdf.PDF_REDACT_TEXT_REMOVE,
        }
        page.apply_redactions(**redaction_modes)

        # Replacement text is drawn only after the redactions are applied.
        for signature_op in page_ops:
            _render_text_op(page, signature_op)


def _apply_redactions(
doc: pymupdf.Document,
page_ops: dict[int, list[dict]],
Expand All @@ -821,8 +961,7 @@ def _apply_redactions(
_prepare_signature_widget_ops(doc, signature_widget_ops)

text_page_ops, asset_page_ops = _partition_page_ops(page_ops)
for page_idx, ops in signature_widget_ops.items():
asset_page_ops.setdefault(page_idx, []).extend(ops)

_apply_text_redactions(doc, text_page_ops)
_apply_asset_redactions(doc, asset_page_ops)
_apply_signature_redactions(doc, signature_widget_ops)
12 changes: 5 additions & 7 deletions aymurai/text/anonymization/pdf/sanitize.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,14 +96,12 @@ def _cleanup_rect_for_signature_widget_op(op: dict[str, Any]) -> pymupdf.Rect |
Returns:
pymupdf.Rect | None: The cleanup rectangle for the signature widget operation, if available.
"""
widget_rect = op.get("widget_rect")
if widget_rect is not None:
return pymupdf.Rect(widget_rect)

background_rect = op.get("background_rect") or op.get("canvas_rect")
if background_rect is None:
cleanup_source = (
op.get("redact_rect") or op.get("background_rect") or op.get("canvas_rect")
)
if cleanup_source is None:
return None
return pymupdf.Rect(background_rect)
return pymupdf.Rect(cleanup_source)


def _collect_link_cleanup_rects(
Expand Down
51 changes: 30 additions & 21 deletions aymurai/text/anonymization/pdf/widgets.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,21 +29,24 @@ def _signature_background_rect(
Returns:
pymupdf.Rect: The background rectangle for the signature replacement.
"""
background = pymupdf.Rect(
op.get("line_rect") or op.get("canvas_rect") or widget_rect
background_source = (
op.get("canvas_rect") or op.get("redact_rect") or op.get("line_rect")
)
canvas_rect = op.get("canvas_rect")
if canvas_rect is not None:
background.include_rect(pymupdf.Rect(canvas_rect))
background = pymupdf.Rect(background_source or widget_rect)

pad_x = max(background.height * 0.75, 2.0)
pad_y = max(background.height * 0.25, 0.75)
redact_rect = op.get("redact_rect")
if redact_rect is not None:
background.include_rect(pymupdf.Rect(redact_rect))

# Keep the repaint area on the sensitive text line so the replacement
# background does not visually cover adjacent non-sensitive content
pad_x = max(background.height * 0.2, 0.5)
widget_clip = pymupdf.Rect(widget_rect)

background.x0 = max(widget_clip.x0, background.x0 - pad_x)
background.y0 = max(widget_clip.y0, background.y0 - pad_y)
background.y0 = max(widget_clip.y0, background.y0)
background.x1 = min(widget_clip.x1, background.x1 + pad_x)
background.y1 = min(widget_clip.y1, background.y1 + pad_y)
background.y1 = min(widget_clip.y1, background.y1)
return background


Expand Down Expand Up @@ -272,12 +275,19 @@ def _prepare_signature_widget_ops(
signature_widget_ops: dict[int, list[dict]],
) -> None:
"""
Deletes signature widgets and prepares their replacement operations.
Flattens signature widgets and prepares their replacement operations.

PyMuPDF bakes widgets at document scope, not per widget. When a
signature widget must be flattened, all widgets are intentionally baked
before sanitization so their visible appearances survive in the static
anonymized PDF.

Args:
doc (pymupdf.Document): The PDF document being processed.
signature_widget_ops (dict[int, list[dict]]): The collected signature widget operations grouped by page index.
"""
should_bake_widgets = False

for page_idx, ops in signature_widget_ops.items():
if not ops:
continue
Expand All @@ -300,15 +310,7 @@ def _prepare_signature_widget_ops(

if widget is not None:
widget_rect = pymupdf.Rect(widget.rect)
try:
page.delete_widget(widget)
except Exception as exc:
logger.warning(
"Failed to delete signature widget xref=%s on page=%s: %s",
widget_xref,
page_idx,
exc,
)
should_bake_widgets = True
else:
logger.warning(
"Could not resolve PDF signature widget xref=%s on page=%s",
Expand All @@ -318,6 +320,13 @@ def _prepare_signature_widget_ops(

for op in widget_group_ops:
op["widget_rect"] = pymupdf.Rect(widget_rect)
op["asset_rect"] = pymupdf.Rect(widget_rect)
op["graphics_mode"] = pymupdf.PDF_REDACT_LINE_ART_REMOVE_IF_COVERED
op.pop("asset_rect", None)
op.pop("image_rect", None)
op.pop("graphics_mode", None)
op["background_rect"] = _signature_background_rect(op, widget_rect)

if should_bake_widgets:
try:
doc.bake(annots=False, widgets=True)
except Exception as exc:
logger.warning("Failed to flatten PDF signature widgets: %s", exc)
Comment on lines +328 to +332
Loading
Loading