diff --git a/aymurai/text/anonymization/pdf/ops.py b/aymurai/text/anonymization/pdf/ops.py index bdad1d0..fb5a324 100644 --- a/aymurai/text/anonymization/pdf/ops.py +++ b/aymurai/text/anonymization/pdf/ops.py @@ -189,6 +189,104 @@ def _image_rects_for_clip( return rects +def _squared_distance_between_rect_centers( + left: pymupdf.Rect, + right: pymupdf.Rect, +) -> float: + """ + Computes the squared distance between two rectangle centers. + + Args: + left (pymupdf.Rect): The first rectangle. + right (pymupdf.Rect): The second rectangle. + + Returns: + float: The squared distance between rectangle centers. + """ + left_center = ((left.x0 + left.x1) / 2.0, (left.y0 + left.y1) / 2.0) + right_center = ((right.x0 + right.x1) / 2.0, (right.y0 + right.y1) / 2.0) + return (left_center[0] - right_center[0]) ** 2 + ( + left_center[1] - right_center[1] + ) ** 2 + + +def _refine_signature_text_rect( + page: pymupdf.Page, + entity_text: str, + widget_rect: pymupdf.Rect, + current_rect: pymupdf.Rect, +) -> pymupdf.Rect: + """ + Finds a tighter text rectangle for signer names inside signature widgets. + + Args: + page (pymupdf.Page): The PDF page being processed. + entity_text (str): The entity text being mapped. + widget_rect (pymupdf.Rect): The signature widget rectangle. + current_rect (pymupdf.Rect): The currently resolved entity rectangle. + + Returns: + pymupdf.Rect: The refined rectangle when available, otherwise current_rect. + """ + widget_clip = pymupdf.Rect(widget_rect) + hits = [ + pymupdf.Rect(hit) + for hit in page.search_for(entity_text, clip=widget_clip) + if pymupdf.Rect(hit).intersects(widget_clip) + ] + if not hits: + return pymupdf.Rect(current_rect) + + target = pymupdf.Rect(current_rect) + intersecting_hits = [hit for hit in hits if hit.intersects(target)] + candidates = intersecting_hits or hits + return pymupdf.Rect( + min( + candidates, + key=lambda hit: _squared_distance_between_rect_centers(hit, target), + ) + ) + + +def _build_signature_page_op( + page: pymupdf.Page, + entity_text: str, + widget_info: dict[str, Any], + current_rect: pymupdf.Rect, + token: str, + entity_style: dict[str, Any] | None = None, +) -> dict[str, Any]: + """ + Builds a signature-specific operation scoped to the sensitive text only. + + Args: + page (pymupdf.Page): The PDF page being processed. + entity_text (str): The sensitive text being replaced. + widget_info (dict[str, Any]): The signature widget metadata. + current_rect (pymupdf.Rect): The initially resolved text rectangle. + token (str): The logical replacement token. + entity_style (dict[str, Any] | None): The text style to render with. + + Returns: + dict[str, Any]: The signature replacement operation. + """ + refined_rect = _refine_signature_text_rect( + page, + entity_text, + widget_info["rect"], + current_rect, + ) + op = _build_page_op( + refined_rect, + None, + token, + entity_style=entity_style or widget_info.get("style") or None, + ) + op["widget_xref"] = widget_info["xref"] + op["widget_rect"] = widget_info["rect"] + return op + + def _entity_overlaps_image( page: pymupdf.Page, entity_rect: pymupdf.Rect, @@ -303,14 +401,14 @@ def _collect_page_redactions( fallback_widget["field_type"] == pymupdf.PDF_WIDGET_TYPE_SIGNATURE ): - op = _build_page_op( + op = _build_signature_page_op( + page, + entity_text, + fallback_widget, fallback_rects[0], - lines[0] if lines else None, token, entity_style=fallback_widget.get("style") or None, ) - op["widget_xref"] = fallback_widget["xref"] - op["widget_rect"] = fallback_widget["rect"] signature_widget_ops.setdefault(page_index, []).append(op) continue @@ -470,14 +568,14 @@ def _collect_page_redactions( ) continue if widget_info["field_type"] == pymupdf.PDF_WIDGET_TYPE_SIGNATURE: - op = _build_page_op( + op = _build_signature_page_op( + page, + entity_text, + widget_info, rect, - line, token, entity_style=ent_style, ) - op["widget_xref"] = widget_info["xref"] - op["widget_rect"] = widget_info["rect"] signature_widget_ops.setdefault(page_index, []).append(op) continue @@ -514,43 +612,51 @@ def _collect_page_redactions( for seg_idx, ( seg_line, - _seg_text, + seg_text, seg_rect, seg_img, seg_style, seg_widget, ) in enumerate(segments): + if signature_widget is not None: + op = _build_signature_page_op( + page, + seg_text, + signature_widget, + seg_rect, + token, + entity_style=seg_style, + ) + if seg_idx != widest_idx: + op["text"] = None + op["fontsize"] = None + signature_widget_ops.setdefault(page_index, []).append(op) + continue + if seg_idx == widest_idx: op = _build_page_op( seg_rect, seg_line, token, - is_image=(any_image and signature_widget is None), + is_image=any_image, entity_style=seg_style, ) - if signature_widget is None and shared_image_rect is not None: + if shared_image_rect is not None: op["image_rect"] = shared_image_rect else: op = _build_page_op( seg_rect, seg_line, token, - is_image=( - (seg_img is not None) and signature_widget is None - ), + is_image=(seg_img is not None), entity_style=seg_style, ) op["text"] = None op["fontsize"] = None - if seg_img is not None and signature_widget is None: + if seg_img is not None: op["image_rect"] = seg_img - if signature_widget is not None: - op["widget_xref"] = signature_widget["xref"] - op["widget_rect"] = signature_widget["rect"] - signature_widget_ops.setdefault(page_index, []).append(op) - else: - page_ops.setdefault(page_index, []).append(op) + page_ops.setdefault(page_index, []).append(op) return page_ops, widget_ops, signature_widget_ops @@ -802,6 +908,40 @@ def _apply_asset_redactions( _render_text_op(page, op) +def _apply_signature_redactions( + doc: pymupdf.Document, + signature_widget_ops: dict[int, list[dict]], +) -> None: + """ + Applies signer-name redactions without removing the full signature appearance. + + Args: + doc (pymupdf.Document): The PDF document being processed. + signature_widget_ops (dict[int, list[dict]]): The signature operations grouped by page index. + """ + for page_idx, ops in signature_widget_ops.items(): + if not ops: + continue + + page = doc[page_idx] + for op in ops: + page.add_redact_annot( + op["redact_rect"], + text=None, + fill=(1, 1, 1), + cross_out=False, + ) + + page.apply_redactions( + images=pymupdf.PDF_REDACT_IMAGE_PIXELS, + graphics=pymupdf.PDF_REDACT_LINE_ART_NONE, + text=pymupdf.PDF_REDACT_TEXT_REMOVE, + ) + + for op in ops: + _render_text_op(page, op) + + def _apply_redactions( doc: pymupdf.Document, page_ops: dict[int, list[dict]], @@ -821,8 +961,7 @@ def _apply_redactions( _prepare_signature_widget_ops(doc, signature_widget_ops) text_page_ops, asset_page_ops = _partition_page_ops(page_ops) - for page_idx, ops in signature_widget_ops.items(): - asset_page_ops.setdefault(page_idx, []).extend(ops) _apply_text_redactions(doc, text_page_ops) _apply_asset_redactions(doc, asset_page_ops) + _apply_signature_redactions(doc, signature_widget_ops) diff --git a/aymurai/text/anonymization/pdf/sanitize.py b/aymurai/text/anonymization/pdf/sanitize.py index 408f32b..ab1bf34 100644 --- a/aymurai/text/anonymization/pdf/sanitize.py +++ b/aymurai/text/anonymization/pdf/sanitize.py @@ -96,14 +96,12 @@ def _cleanup_rect_for_signature_widget_op(op: dict[str, Any]) -> pymupdf.Rect | Returns: pymupdf.Rect | None: The cleanup rectangle for the signature widget operation, if available. """ - widget_rect = op.get("widget_rect") - if widget_rect is not None: - return pymupdf.Rect(widget_rect) - - background_rect = op.get("background_rect") or op.get("canvas_rect") - if background_rect is None: + cleanup_source = ( + op.get("redact_rect") or op.get("background_rect") or op.get("canvas_rect") + ) + if cleanup_source is None: return None - return pymupdf.Rect(background_rect) + return pymupdf.Rect(cleanup_source) def _collect_link_cleanup_rects( diff --git a/aymurai/text/anonymization/pdf/widgets.py b/aymurai/text/anonymization/pdf/widgets.py index 3ea97d7..266a912 100644 --- a/aymurai/text/anonymization/pdf/widgets.py +++ b/aymurai/text/anonymization/pdf/widgets.py @@ -29,21 +29,24 @@ def _signature_background_rect( Returns: pymupdf.Rect: The background rectangle for the signature replacement. """ - background = pymupdf.Rect( - op.get("line_rect") or op.get("canvas_rect") or widget_rect + background_source = ( + op.get("canvas_rect") or op.get("redact_rect") or op.get("line_rect") ) - canvas_rect = op.get("canvas_rect") - if canvas_rect is not None: - background.include_rect(pymupdf.Rect(canvas_rect)) + background = pymupdf.Rect(background_source or widget_rect) - pad_x = max(background.height * 0.75, 2.0) - pad_y = max(background.height * 0.25, 0.75) + redact_rect = op.get("redact_rect") + if redact_rect is not None: + background.include_rect(pymupdf.Rect(redact_rect)) + + # Keep the repaint area on the sensitive text line so the replacement + # background does not visually cover adjacent non-sensitive content + pad_x = max(background.height * 0.2, 0.5) widget_clip = pymupdf.Rect(widget_rect) background.x0 = max(widget_clip.x0, background.x0 - pad_x) - background.y0 = max(widget_clip.y0, background.y0 - pad_y) + background.y0 = max(widget_clip.y0, background.y0) background.x1 = min(widget_clip.x1, background.x1 + pad_x) - background.y1 = min(widget_clip.y1, background.y1 + pad_y) + background.y1 = min(widget_clip.y1, background.y1) return background @@ -272,12 +275,19 @@ def _prepare_signature_widget_ops( signature_widget_ops: dict[int, list[dict]], ) -> None: """ - Deletes signature widgets and prepares their replacement operations. + Flattens signature widgets and prepares their replacement operations. + + PyMuPDF bakes widgets at document scope, not per widget. When a + signature widget must be flattened, all widgets are intentionally baked + before sanitization so their visible appearances survive in the static + anonymized PDF. Args: doc (pymupdf.Document): The PDF document being processed. signature_widget_ops (dict[int, list[dict]]): The collected signature widget operations grouped by page index. """ + should_bake_widgets = False + for page_idx, ops in signature_widget_ops.items(): if not ops: continue @@ -300,15 +310,7 @@ def _prepare_signature_widget_ops( if widget is not None: widget_rect = pymupdf.Rect(widget.rect) - try: - page.delete_widget(widget) - except Exception as exc: - logger.warning( - "Failed to delete signature widget xref=%s on page=%s: %s", - widget_xref, - page_idx, - exc, - ) + should_bake_widgets = True else: logger.warning( "Could not resolve PDF signature widget xref=%s on page=%s", @@ -318,6 +320,13 @@ def _prepare_signature_widget_ops( for op in widget_group_ops: op["widget_rect"] = pymupdf.Rect(widget_rect) - op["asset_rect"] = pymupdf.Rect(widget_rect) - op["graphics_mode"] = pymupdf.PDF_REDACT_LINE_ART_REMOVE_IF_COVERED + op.pop("asset_rect", None) + op.pop("image_rect", None) + op.pop("graphics_mode", None) op["background_rect"] = _signature_background_rect(op, widget_rect) + + if should_bake_widgets: + try: + doc.bake(annots=False, widgets=True) + except Exception as exc: + logger.warning("Failed to flatten PDF signature widgets: %s", exc) diff --git a/tests/api/routers/anonymizer/test_anonymizer.py b/tests/api/routers/anonymizer/test_anonymizer.py index 8225e29..0648c62 100644 --- a/tests/api/routers/anonymizer/test_anonymizer.py +++ b/tests/api/routers/anonymizer/test_anonymizer.py @@ -13,14 +13,20 @@ from aymurai.database.schema import AnonymizationParagraph from aymurai.database.utils import text_to_uuid +from aymurai.meta.api_interfaces import LabelPolicy, RenderPolicy from aymurai.text.anonymization import DocxAnonymizer, PdfAnonymizer, get_anonymizer from aymurai.text.anonymization.alignment import index_paragraphs +from aymurai.text.anonymization.pdf.ops import _refine_signature_text_rect +from aymurai.text.anonymization.pdf.widgets import _signature_background_rect from tests.api.conftest import build_label from tests.api.routers.conftest import build_mock_pipeline PNG_1X1 = base64.b64decode( "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/x8AAwMCAO+a6R8AAAAASUVORK5CYII=" ) +PNG_BLACK_1X1 = base64.b64decode( + "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAAAAAA6fptVAAAACklEQVR42mNgAAAAAgAB4iG8MwAAAABJRU5ErkJggg==" +) WATERMARK_URL = "https://www.aymurai.info/" WINDOWS_PYMUPDF_LAYOUT_XFAIL = pytest.mark.xfail( @@ -50,6 +56,7 @@ def _run_pdf_anonymizer( source_path: Path, document: str, labels: list[dict], + render_context: dict | None = None, ) -> Path: output_dir = tmp_path / "out" output_dir.mkdir(exist_ok=True) @@ -57,10 +64,174 @@ def _run_pdf_anonymizer( {"path": str(source_path)}, [{"document": document, "labels": labels}], str(output_dir), + render_context=render_context, ) return Path(output_path) +def _label_for_document_text(document: str, text: str, label: str = "PER") -> dict: + payload = _label_dict(text, label) + start = document.index(text) + payload["start_char"] = start + payload["end_char"] = start + len(text) + return payload + + +def _render_context_for_entities(labels: list[dict]) -> dict: + index_by_entity = {} + next_index_by_base = {} + for label in labels: + attrs = label.get("attrs") or {} + base = attrs.get("aymurai_label") or label.get("label") or "ENT" + entity_id = str(attrs.get("canonical_entity_id") or label.get("text")) + key = (base, entity_id) + if key not in index_by_entity: + next_index_by_base[base] = next_index_by_base.get(base, 0) + 1 + index_by_entity[key] = next_index_by_base[base] + + return { + "render_policy": RenderPolicy(suffix_mode="always", suffix_threshold=0), + "label_policies": {"PER": LabelPolicy()}, + "count_by_base": dict(next_index_by_base), + "index_by_entity": index_by_entity, + } + + +def _dark_pixel_ratio(page: pymupdf.Page, rect: pymupdf.Rect) -> float: + pixmap = page.get_pixmap( + matrix=pymupdf.Matrix(2, 2), + clip=rect, + alpha=False, + ) + samples = pixmap.samples + if not samples: + return 0.0 + + channels = pixmap.n + dark_pixels = 0 + total_pixels = pixmap.width * pixmap.height + for offset in range(0, len(samples), channels): + if all(channel < 96 for channel in samples[offset : offset + 3]): + dark_pixels += 1 + + return dark_pixels / max(total_pixels, 1) + + +def _assert_text_count(page_text: str, text: str, expected: int) -> None: + assert page_text.count(text) == expected, page_text + + +def _assert_rect_close(actual: pymupdf.Rect, expected: pymupdf.Rect) -> None: + assert (actual.x0, actual.y0, actual.x1, actual.y1) == pytest.approx( + (expected.x0, expected.y0, expected.x1, expected.y1) + ) + + +def _write_variable_signature_pdf( + path: Path, +) -> tuple[Path, list[dict], list[str], list[str], list[pymupdf.Rect]]: + blocks = [ + { + "origin": (58, 112), + "lines": [ + "Mesa de Control 42", + "Adriana Morales", + "Area de Validacion", + "Codigo A-17", + ], + "signer": "Adriana Morales", + "qr": "top", + }, + { + "origin": (326, 112), + "lines": [ + "Bernardo Diaz", + "Direccion Legal", + "Organismo Beta Sur", + "Tramite BX-900", + ], + "signer": "Bernardo Diaz", + "qr": "right", + }, + { + "origin": (58, 328), + "lines": [ + "Centro Operativo", + "Carolina Ruiz", + "Secretaria Tecnica", + "2026-05-26 10:15", + ], + "signer": "Carolina Ruiz", + "qr": "left", + }, + { + "origin": (326, 328), + "lines": [ + "Unidad Regional", + "Coordinacion de Revision", + "Daniel Silva", + "Expediente Digital Z-42", + ], + "signer": "Daniel Silva", + "qr": "top", + }, + { + "origin": (58, 544), + "lines": [ + "Responsable: Elena Torres - Acta Final", + "Delegacion Gamma", + "Registro Interno R-204", + ], + "signer": "Elena Torres", + "qr": "right", + }, + ] + + doc = pymupdf.open() + page = doc.new_page() + preds: list[dict] = [] + preserved_texts: list[str] = [] + signers: list[str] = [] + qr_rects: list[pymupdf.Rect] = [] + + for idx, block in enumerate(blocks): + x, y = block["origin"] + if block["qr"] == "right": + qr_rect = pymupdf.Rect(x + 150, y - 4, x + 182, y + 28) + elif block["qr"] == "left": + qr_rect = pymupdf.Rect(x - 2, y - 48, x + 30, y - 16) + else: + qr_rect = pymupdf.Rect(x, y - 52, x + 32, y - 20) + page.insert_image(qr_rect, stream=PNG_BLACK_1X1) + qr_rects.append(qr_rect) + + for line_idx, line in enumerate(block["lines"]): + page.insert_text((x, y + (line_idx * 16)), line, fontsize=11) + if line == block["signer"]: + continue + if block["signer"] in line: + preserved_texts.extend( + part.strip() for part in line.split(block["signer"]) if part.strip() + ) + else: + preserved_texts.append(line) + + widget = pymupdf.Widget() + widget.field_name = f"sig_{idx}" + widget.field_type = pymupdf.PDF_WIDGET_TYPE_SIGNATURE + widget.rect = pymupdf.Rect(x - 12, y - 62, x + 230, y + 64) + page.add_widget(widget) + + document = "\n".join(block["lines"]) + label = _label_for_document_text(document, block["signer"]) + preds.append({"document": document, "labels": [label]}) + signers.append(block["signer"]) + + doc.save(path) + doc.close() + return path, preds, signers, preserved_texts, qr_rects + + @pytest.mark.integration def test_anonymization_package_exports_and_registry_are_stable(): assert PdfAnonymizer.__name__ == "PdfAnonymizer" @@ -218,17 +389,234 @@ def test_pdf_anonymizer_removes_image_backed_entities(tmp_path): assert "" in page_text +def test_signature_background_rect_stays_on_signer_name_line(): + background = _signature_background_rect( + { + "line_rect": pymupdf.Rect(80, 70, 220, 118), + "canvas_rect": pymupdf.Rect(112, 70, 145, 82), + "redact_rect": pymupdf.Rect(112, 70, 145, 82), + }, + pymupdf.Rect(60, 60, 230, 130), + ) + + assert background.y0 >= 70 + assert background.y1 <= 82 + + +def test_signature_text_rect_refinement_does_not_include_role_text(tmp_path): + source_path = _write_pdf( + tmp_path / "signature-role.pdf", + lambda _doc, page: ( + page.insert_text((100, 80), "RUIZ"), + page.insert_text((100, 96), "JUEZ/A"), + ), + ) + + with pymupdf.open(source_path) as doc: + page = doc[0] + signer_rect = page.search_for("RUIZ")[0] + role_rect = page.search_for("JUEZ/A")[0] + loose_rect = pymupdf.Rect(signer_rect) + loose_rect.include_rect(role_rect) + + refined = _refine_signature_text_rect( + page, + "RUIZ", + pymupdf.Rect(80, 60, 200, 115), + loose_rect, + ) + + assert refined.intersects(signer_rect) + assert not refined.intersects(role_rect) + + +def test_signature_text_rect_refinement_returns_current_rect_when_no_hit_in_widget( + tmp_path, +): + source_path = _write_pdf( + tmp_path / "signature-no-hit.pdf", + lambda _doc, page: page.insert_text((260, 80), "RUIZ"), + ) + + with pymupdf.open(source_path) as doc: + page = doc[0] + current_rect = pymupdf.Rect(100, 72, 130, 84) + + refined = _refine_signature_text_rect( + page, + "RUIZ", + pymupdf.Rect(80, 60, 180, 115), + current_rect, + ) + + _assert_rect_close(refined, current_rect) + + +def test_signature_text_rect_refinement_selects_closest_matching_hit(tmp_path): + source_path = _write_pdf( + tmp_path / "signature-multiple-hits.pdf", + lambda _doc, page: ( + page.insert_text((100, 80), "RUIZ"), + page.insert_text((220, 80), "RUIZ"), + ), + ) + + with pymupdf.open(source_path) as doc: + page = doc[0] + left_rect, right_rect = page.search_for("RUIZ") + target = pymupdf.Rect(right_rect) + target.x0 += 2 + target.x1 += 2 + + refined = _refine_signature_text_rect( + page, + "RUIZ", + pymupdf.Rect(80, 60, 280, 115), + target, + ) + + assert refined.intersects(right_rect) + assert not refined.intersects(left_rect) + + +@pytest.mark.integration +@WINDOWS_PYMUPDF_LAYOUT_XFAIL +def test_pdf_anonymizer_only_redacts_marked_signature_names_in_variable_layouts( + tmp_path, +): + source_path, preds, signers, preserved_texts, qr_rects = ( + _write_variable_signature_pdf(tmp_path / "variable-signatures.pdf") + ) + render_context = _render_context_for_entities([pred["labels"][0] for pred in preds]) + output_dir = tmp_path / "out-variable" + output_dir.mkdir(exist_ok=True) + + output_path = PdfAnonymizer().anonymize( + {"path": str(source_path)}, + preds, + str(output_dir), + render_context=render_context, + ) + + with pymupdf.open(output_path) as output_doc: + page = output_doc[0] + page_text = page.get_text() + + assert list(page.widgets() or []) == [] + assert len(page.get_image_info()) >= len(qr_rects) + + for signer in signers: + assert signer not in page_text + + for index in range(1, len(signers) + 1): + assert f"" in page_text + + for preserved_text in preserved_texts: + _assert_text_count(page_text, preserved_text, 1) + + for qr_rect in qr_rects: + assert _dark_pixel_ratio(page, qr_rect) > 0.25 + + +@pytest.mark.integration +@WINDOWS_PYMUPDF_LAYOUT_XFAIL +def test_pdf_anonymizer_leaves_unlabeled_signature_names_visible(tmp_path): + source_path, preds, signers, preserved_texts, qr_rects = ( + _write_variable_signature_pdf(tmp_path / "partially-labeled-signatures.pdf") + ) + unlabeled_signer = signers[-1] + filtered_preds = [] + filtered_labels = [] + for pred, signer in zip(preds, signers, strict=True): + labels = [] if signer == unlabeled_signer else pred["labels"] + filtered_preds.append({**pred, "labels": labels}) + filtered_labels.extend(labels) + + render_context = _render_context_for_entities(filtered_labels) + output_dir = tmp_path / "out-partial" + output_dir.mkdir(exist_ok=True) + + output_path = PdfAnonymizer().anonymize( + {"path": str(source_path)}, + filtered_preds, + str(output_dir), + render_context=render_context, + ) + + with pymupdf.open(output_path) as output_doc: + page = output_doc[0] + page_text = page.get_text() + + assert list(page.widgets() or []) == [] + assert unlabeled_signer in page_text + assert "" not in page_text + + for index, signer in enumerate(signers[:-1], start=1): + assert signer not in page_text + assert f"" in page_text + + for preserved_text in preserved_texts: + _assert_text_count(page_text, preserved_text, 1) + + for qr_rect in qr_rects: + assert _dark_pixel_ratio(page, qr_rect) > 0.25 + + +@pytest.mark.integration +@WINDOWS_PYMUPDF_LAYOUT_XFAIL +def test_pdf_anonymizer_preserves_non_signature_widget_appearance_when_baking( + tmp_path, +): + def configure(_doc: pymupdf.Document, page: pymupdf.Page) -> None: + page.insert_text((80, 88), "Ana Perez") + + text_widget = pymupdf.Widget() + text_widget.field_name = "public_field" + text_widget.field_type = pymupdf.PDF_WIDGET_TYPE_TEXT + text_widget.field_value = "Visible Field Value" + text_widget.text_font = "Helv" + text_widget.text_fontsize = 10 + text_widget.rect = pymupdf.Rect(260, 70, 410, 96) + page.add_widget(text_widget) + + signature_widget = pymupdf.Widget() + signature_widget.field_name = "sig_1" + signature_widget.field_type = pymupdf.PDF_WIDGET_TYPE_SIGNATURE + signature_widget.rect = pymupdf.Rect(60, 60, 180, 110) + page.add_widget(signature_widget) + + source_path = _write_pdf(tmp_path / "signature-and-text-widget.pdf", configure) + output_path = _run_pdf_anonymizer( + tmp_path, + source_path, + "Ana Perez", + [_label_dict("Ana Perez")], + ) + + with pymupdf.open(output_path) as output_doc: + page = output_doc[0] + page_text = page.get_text() + + assert list(page.widgets() or []) == [] + assert "Visible Field Value" in page_text + assert "Ana Perez" not in page_text + assert "" in page_text + + @pytest.mark.integration @WINDOWS_PYMUPDF_LAYOUT_XFAIL -def test_pdf_anonymizer_removes_signature_widgets_without_restoring_appearance( +def test_pdf_anonymizer_preserves_signature_appearance_when_redacting_signer_name( tmp_path, ): def configure(_doc: pymupdf.Document, page: pymupdf.Page) -> None: - page.insert_text((80, 90), "Ana Perez") + page.insert_text((80, 76), "FIRMADO DIGITALMENTE") + page.insert_text((80, 92), "05/02/2025 14:17") + page.insert_text((80, 108), "Ana Perez") + page.insert_image(pymupdf.Rect(185, 68, 215, 98), stream=PNG_1X1) widget = pymupdf.Widget() widget.field_name = "sig_1" widget.field_type = pymupdf.PDF_WIDGET_TYPE_SIGNATURE - widget.rect = pymupdf.Rect(60, 60, 220, 110) + widget.rect = pymupdf.Rect(60, 60, 230, 120) page.add_widget(widget) source_path = _write_pdf(tmp_path / "signature.pdf", configure) @@ -244,7 +632,9 @@ def configure(_doc: pymupdf.Document, page: pymupdf.Page) -> None: page_text = page.get_text() assert list(page.widgets() or []) == [] - assert page.get_image_info() == [] + assert page.get_image_info() != [] + assert "FIRMADO DIGITALMENTE" in page_text + assert "05/02/2025 14:17" in page_text assert "Ana Perez" not in page_text assert "" in page_text