From 49d5c5fb68ef1c78a208af66e0be252ca51e8237 Mon Sep 17 00:00:00 2001 From: conrabeatriz Date: Wed, 20 May 2026 19:02:58 -0300 Subject: [PATCH 1/5] =?UTF-8?q?=F0=9F=90=9B=20Bug=20fixed=20for=20entities?= =?UTF-8?q?=20who=20are=20always=20the=20same=20that=20have=20to=20bypass?= =?UTF-8?q?=20the=20fuzzy=20matching=20algorithm.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../anonymization_postprocess/core.py | 18 ++++++++ aymurai/utils/entity_disambiguation/fuzzy.py | 43 ++++++++++++++++--- .../10-anonymize-document-render-policy.ipynb | 2 +- 3 files changed, 56 insertions(+), 7 deletions(-) diff --git a/aymurai/transforms/anonymization_postprocess/core.py b/aymurai/transforms/anonymization_postprocess/core.py index 977fb109..43df2bf7 100644 --- a/aymurai/transforms/anonymization_postprocess/core.py +++ b/aymurai/transforms/anonymization_postprocess/core.py @@ -33,6 +33,7 @@ def process(self, ent: dict) -> dict: original_text = ent["text"] start_char = ent["start_char"] end_char = ent["end_char"] + label = ent["attrs"]["aymurai_label"] # Match leading and trailing non-alphanumeric characters leading_match = re.match(r"^\W+", original_text) @@ -45,6 +46,23 @@ def process(self, ent: dict) -> dict: # Clean the text cleaned_text = pattern.sub("", original_text) + exact_labels = { + "DNI", + "CUIT_CUIL", + "TELEFONO", + "PATENTE_DOMINIO", + "IP", + "NUM_CAJA_AHORRO", + "CBU", + "NUM_MATRICULA", + } + + ent["attrs"]["aymurai_label_subclass"] = [] + + if label in exact_labels: + flattened_text = re.sub(r"[^a-zA-Z0-9]", "", cleaned_text) + ent["attrs"]["aymurai_label_subclass"].append(flattened_text) + # Update the entity's alt text and indices ent["attrs"]["aymurai_alt_text"] = cleaned_text ent["attrs"]["aymurai_alt_start_char"] = start_char + leading_chars_removed diff --git a/aymurai/utils/entity_disambiguation/fuzzy.py b/aymurai/utils/entity_disambiguation/fuzzy.py index 682810b6..7255b60f 100644 --- a/aymurai/utils/entity_disambiguation/fuzzy.py +++ b/aymurai/utils/entity_disambiguation/fuzzy.py @@ -7,6 +7,17 @@ from aymurai.meta.api_interfaces import DocLabel from aymurai.meta.entities import CanonicalEntity +EXACT_LABELS = { + "DNI", + "CUIT_CUIL", + "TELEFONO", + "PATENTE_DOMINIO", + "IP", + "NUM_CAJA_AHORRO", + "CBU", + "NUM_MATRICULA", +} + def _find_parent(parent: list[int], idx: int) -> int: """ @@ -179,16 +190,36 @@ def build_canonical_entities( if target_labels and attrs.aymurai_label not in target_labels: continue alias = attrs.aymurai_alt_text or label.text + + subclass_val = getattr(attrs, "aymurai_label_subclass", None) + + if isinstance(subclass_val, list): + exact_alias = subclass_val[-1] if subclass_val else alias + else: + exact_alias = subclass_val or alias + grouped.setdefault(attrs.aymurai_label, []).append( - {"text": alias, "aymurai_label": attrs.aymurai_label} + { + "text": alias, + "aymurai_label": attrs.aymurai_label, + "exact_alias": exact_alias, + } ) canonical_entities: list[CanonicalEntity] = [] - for items in grouped.values(): - clusters = _cluster_aliases_with_cdist( - items=items, - threshold=threshold, - ) + for label_type, items in grouped.items(): + if label_type in EXACT_LABELS: + exact_groups = {} + for item in items: + exact_groups.setdefault(item["exact_alias"], []).append(item) + + clusters = list(exact_groups.values()) + else: + clusters = _cluster_aliases_with_cdist( + items=items, + threshold=threshold, + ) + canonical_entities.extend(_clusters_to_canonical_entities(clusters)) canonical_entities = sorted(canonical_entities, key=lambda x: x.canonical_text) diff --git a/notebooks/experiments/entity-disambiguation/10-anonymize-document-render-policy.ipynb b/notebooks/experiments/entity-disambiguation/10-anonymize-document-render-policy.ipynb index c072066f..6ea1fba6 100644 --- a/notebooks/experiments/entity-disambiguation/10-anonymize-document-render-policy.ipynb +++ b/notebooks/experiments/entity-disambiguation/10-anonymize-document-render-policy.ipynb @@ -64,7 +64,7 @@ "\n", "print(f\"Found {len(documents)} documents\")\n", "\n", - "doc_path = documents[14]\n", + "doc_path = documents[5]\n", "print(f\"Processing document: {doc_path}\")" ] }, From 6822b329524d9241dd0523f8187b2b0621be099e Mon Sep 17 00:00:00 2001 From: conrabeatriz Date: Tue, 26 May 2026 18:02:20 -0300 Subject: [PATCH 2/5] =?UTF-8?q?=E2=9A=A1=EF=B8=8F=20Improved=20structure?= =?UTF-8?q?=20following=20copilot=20comments.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../anonymization_postprocess/core.py | 26 +++++++++---------- .../anonymization_postprocess/exact_labels.py | 10 +++++++ aymurai/utils/entity_disambiguation/fuzzy.py | 24 ++++++++--------- 3 files changed, 34 insertions(+), 26 deletions(-) create mode 100644 aymurai/transforms/anonymization_postprocess/exact_labels.py diff --git a/aymurai/transforms/anonymization_postprocess/core.py b/aymurai/transforms/anonymization_postprocess/core.py index 7f344007..af5185e4 100644 --- a/aymurai/transforms/anonymization_postprocess/core.py +++ b/aymurai/transforms/anonymization_postprocess/core.py @@ -4,6 +4,7 @@ from aymurai.meta.pipeline_interfaces import Transform from aymurai.meta.types import DataItem from aymurai.utils.misc import get_element +from aymurai.transforms.anonymization_postprocess.exact_labels import EXACT_LABELS class AnonymizationEntityCleaner(Transform): @@ -48,27 +49,24 @@ def process(self, ent: dict) -> dict: if not cleaned_text: return None - exact_labels = { - "DNI", - "CUIT_CUIL", - "TELEFONO", - "PATENTE_DOMINIO", - "IP", - "NUM_CAJA_AHORRO", - "CBU", - "NUM_MATRICULA", - } + raw_subclass = ent["attrs"]["aymurai_label_subclass"] + if isinstance(raw_subclass, list): + aymurai_label_subclass = raw_subclass.copy() + elif raw_subclass: + aymurai_label_subclass = [raw_subclass] + else: + aymurai_label_subclass = [] - ent["attrs"]["aymurai_label_subclass"] = [] - - if label in exact_labels: + if label in EXACT_LABELS: flattened_text = re.sub(r"[^a-zA-Z0-9]", "", cleaned_text) - ent["attrs"]["aymurai_label_subclass"].append(flattened_text) + if flattened_text and flattened_text not in aymurai_label_subclass: + aymurai_label_subclass.append(flattened_text) # Update the entity's alt text and indices ent["attrs"]["aymurai_alt_text"] = cleaned_text ent["attrs"]["aymurai_alt_start_char"] = start_char + leading_chars_removed ent["attrs"]["aymurai_alt_end_char"] = end_char - trailing_chars_removed + ent["attrs"]["aymurai_label_subclass"] = aymurai_label_subclass return ent diff --git a/aymurai/transforms/anonymization_postprocess/exact_labels.py b/aymurai/transforms/anonymization_postprocess/exact_labels.py new file mode 100644 index 00000000..afe345da --- /dev/null +++ b/aymurai/transforms/anonymization_postprocess/exact_labels.py @@ -0,0 +1,10 @@ +EXACT_LABELS = { + "DNI", + "CUIT_CUIL", + "TELEFONO", + "PATENTE_DOMINIO", + "IP", + "NUM_CAJA_AHORRO", + "CBU", + "NUM_MATRICULA", +} diff --git a/aymurai/utils/entity_disambiguation/fuzzy.py b/aymurai/utils/entity_disambiguation/fuzzy.py index 7255b60f..e55850d1 100644 --- a/aymurai/utils/entity_disambiguation/fuzzy.py +++ b/aymurai/utils/entity_disambiguation/fuzzy.py @@ -7,16 +7,7 @@ from aymurai.meta.api_interfaces import DocLabel from aymurai.meta.entities import CanonicalEntity -EXACT_LABELS = { - "DNI", - "CUIT_CUIL", - "TELEFONO", - "PATENTE_DOMINIO", - "IP", - "NUM_CAJA_AHORRO", - "CBU", - "NUM_MATRICULA", -} +from aymurai.transforms.anonymization_postprocess.exact_labels import EXACT_LABELS def _find_parent(parent: list[int], idx: int) -> int: @@ -212,8 +203,17 @@ def build_canonical_entities( exact_groups = {} for item in items: exact_groups.setdefault(item["exact_alias"], []).append(item) - - clusters = list(exact_groups.values()) + clusters = [ + [ + ( + item["text"], + str(item["exact_alias"]).lower().strip(), + item["aymurai_label"], + ) + for item in group_items + ] + for group_items in exact_groups.values() + ] else: clusters = _cluster_aliases_with_cdist( items=items, From c963e3daa70a0d8cd9e3a80a0ed8d695ac2f23bc Mon Sep 17 00:00:00 2001 From: conrabeatriz Date: Tue, 26 May 2026 18:05:31 -0300 Subject: [PATCH 3/5] =?UTF-8?q?=E2=9A=97=EF=B8=8F=20Experimentation.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../10-anonymize-document-render-policy.ipynb | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/notebooks/experiments/entity-disambiguation/10-anonymize-document-render-policy.ipynb b/notebooks/experiments/entity-disambiguation/10-anonymize-document-render-policy.ipynb index 6ea1fba6..92bc8185 100644 --- a/notebooks/experiments/entity-disambiguation/10-anonymize-document-render-policy.ipynb +++ b/notebooks/experiments/entity-disambiguation/10-anonymize-document-render-policy.ipynb @@ -430,6 +430,13 @@ "result = group_alt_texts_by_entity(anonymize_labels_fuzzy)\n", "print(json.dumps(result, indent=4, ensure_ascii=False))" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { From 19603dc3dbdbebb2c3db91947742bc216bb8ba3d Mon Sep 17 00:00:00 2001 From: jansaldo Date: Wed, 27 May 2026 18:51:52 +0000 Subject: [PATCH 4/5] =?UTF-8?q?=F0=9F=90=9B=20Merge=20duplicate=20labels?= =?UTF-8?q?=20for=20the=20same=20span=20and=20AymurAI=20label=20in=20=5Fde?= =?UTF-8?q?dupe=5Fdoclabels=20function?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../routers/anonymizer/anonymizer.py | 44 ++++++++++++++----- 1 file changed, 34 insertions(+), 10 deletions(-) diff --git a/aymurai/api/endpoints/routers/anonymizer/anonymizer.py b/aymurai/api/endpoints/routers/anonymizer/anonymizer.py index 6bc7b494..6fde1db7 100644 --- a/aymurai/api/endpoints/routers/anonymizer/anonymizer.py +++ b/aymurai/api/endpoints/routers/anonymizer/anonymizer.py @@ -89,30 +89,54 @@ def _entities_to_doclabels(entities: list[dict]) -> list[DocLabel]: def _dedupe_doclabels(labels: Iterable[DocLabel]) -> list[DocLabel]: """ - Remove exact duplicate labels while preserving first-seen order. + Merge duplicate labels for the same span and AymurAI label. Args: labels (Iterable[DocLabel]): An iterable of DocLabel objects, potentially containing duplicates. Returns: - list[DocLabel]: A list of DocLabel objects with duplicates removed, + list[DocLabel]: A list of DocLabel objects with duplicates merged, preserving the order of first occurrence. """ deduped: list[DocLabel] = [] - seen: set[str] = set() + index_by_key: dict[tuple[int, int, str], int] = {} for label in labels: - key = json.dumps( - label.model_dump(mode="json", exclude_none=True), - sort_keys=True, - separators=(",", ":"), + key = ( + label.start_char, + label.end_char, + label.attrs.aymurai_label if label.attrs else "", ) - if key in seen: + existing_index = index_by_key.get(key) + if existing_index is None: + index_by_key[key] = len(deduped) + deduped.append(label) continue - seen.add(key) - deduped.append(label) + existing = deduped[existing_index] + existing_data = existing.model_dump(mode="json") + incoming_data = label.model_dump(mode="json") + existing_attrs = existing_data.get("attrs") or {} + incoming_attrs = incoming_data.get("attrs") or {} + + for attr_key, incoming_value in incoming_attrs.items(): + existing_value = existing_attrs.get(attr_key) + if attr_key == "aymurai_label_subclass": + merged = list(existing_value or []) + for subclass in incoming_value or []: + if subclass not in merged: + merged.append(subclass) + existing_attrs[attr_key] = merged + elif existing_value in (None, [], "") and incoming_value not in ( + None, + [], + "", + ): + existing_attrs[attr_key] = incoming_value + + existing_data["attrs"] = existing_attrs + deduped[existing_index] = DocLabel.model_validate(existing_data) return deduped From 221ab6dc70b8d6ddba01c212bfd6f9bc9ec1d292 Mon Sep 17 00:00:00 2001 From: jansaldo Date: Wed, 27 May 2026 18:52:35 +0000 Subject: [PATCH 5/5] =?UTF-8?q?=E2=9C=85=20Add=20integration=20test=20for?= =?UTF-8?q?=20merging=20cached=20duplicate=20labels=20for=20the=20same=20s?= =?UTF-8?q?pan?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../api/routers/anonymizer/test_anonymizer.py | 52 +++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/tests/api/routers/anonymizer/test_anonymizer.py b/tests/api/routers/anonymizer/test_anonymizer.py index 8225e29d..8e3e4569 100644 --- a/tests/api/routers/anonymizer/test_anonymizer.py +++ b/tests/api/routers/anonymizer/test_anonymizer.py @@ -545,6 +545,58 @@ def test_should_dedupe_duplicate_labels_when_returning_cached_prediction( mock_load_pipeline.assert_not_called() +@pytest.mark.integration +@patch("aymurai.api.endpoints.routers.anonymizer.anonymizer.load_pipeline") +def test_should_merge_cached_duplicate_labels_for_same_span_and_label( + mock_load_pipeline, client, db_session +): + text = ( + "Víctima: María Paula Trucha, DNI 23.456.789, quien se encuentra " + "conectada con su cámara apagada." + ) + dni_label = build_label("DNI", "23.456.789").model_dump(mode="json") + dni_label.update({"start_char": 33, "end_char": 43}) + dni_label["attrs"].update( + { + "aymurai_alt_text": "23.456.789", + "aymurai_alt_start_char": 33, + "aymurai_alt_end_char": 43, + "aymurai_label_instance": 2, + "aymurai_disambiguation": "fuzzy", + "aymurai_anonymize": True, + "canonical_entity_id": "0bba6d15-1b0c-51f0-b2ca-4fdc8a57cb73", + } + ) + enriched_dni_label = { + **dni_label, + "attrs": { + **dni_label["attrs"], + "aymurai_label_subclass": ["23456789"], + }, + } + + db_session.add( + AnonymizationParagraph( + id=text_to_uuid(text), + text=text, + prediction=[dni_label, enriched_dni_label], + ) + ) + db_session.commit() + + response = client.post( + "/anonymizer/predict", + json={"text": text}, + params={"use_cache": True}, + ) + + assert response.status_code == 200 + data = response.json() + assert data["document"] == text + assert data["labels"] == [enriched_dni_label] + mock_load_pipeline.assert_not_called() + + @pytest.mark.integration @patch( "aymurai.api.endpoints.routers.anonymizer.anonymizer.map_canonical_entities_ner_preds"