diff --git a/aymurai/api/endpoints/routers/anonymizer/anonymizer.py b/aymurai/api/endpoints/routers/anonymizer/anonymizer.py index 6bc7b49..6fde1db 100644 --- a/aymurai/api/endpoints/routers/anonymizer/anonymizer.py +++ b/aymurai/api/endpoints/routers/anonymizer/anonymizer.py @@ -89,30 +89,54 @@ def _entities_to_doclabels(entities: list[dict]) -> list[DocLabel]: def _dedupe_doclabels(labels: Iterable[DocLabel]) -> list[DocLabel]: """ - Remove exact duplicate labels while preserving first-seen order. + Merge duplicate labels for the same span and AymurAI label. Args: labels (Iterable[DocLabel]): An iterable of DocLabel objects, potentially containing duplicates. Returns: - list[DocLabel]: A list of DocLabel objects with duplicates removed, + list[DocLabel]: A list of DocLabel objects with duplicates merged, preserving the order of first occurrence. """ deduped: list[DocLabel] = [] - seen: set[str] = set() + index_by_key: dict[tuple[int, int, str], int] = {} for label in labels: - key = json.dumps( - label.model_dump(mode="json", exclude_none=True), - sort_keys=True, - separators=(",", ":"), + key = ( + label.start_char, + label.end_char, + label.attrs.aymurai_label if label.attrs else "", ) - if key in seen: + existing_index = index_by_key.get(key) + if existing_index is None: + index_by_key[key] = len(deduped) + deduped.append(label) continue - seen.add(key) - deduped.append(label) + existing = deduped[existing_index] + existing_data = existing.model_dump(mode="json") + incoming_data = label.model_dump(mode="json") + existing_attrs = existing_data.get("attrs") or {} + incoming_attrs = incoming_data.get("attrs") or {} + + for attr_key, incoming_value in incoming_attrs.items(): + existing_value = existing_attrs.get(attr_key) + if attr_key == "aymurai_label_subclass": + merged = list(existing_value or []) + for subclass in incoming_value or []: + if subclass not in merged: + merged.append(subclass) + existing_attrs[attr_key] = merged + elif existing_value in (None, [], "") and incoming_value not in ( + None, + [], + "", + ): + existing_attrs[attr_key] = incoming_value + + existing_data["attrs"] = existing_attrs + deduped[existing_index] = DocLabel.model_validate(existing_data) return deduped diff --git a/aymurai/transforms/anonymization_postprocess/core.py b/aymurai/transforms/anonymization_postprocess/core.py index bd596d2..af5185e 100644 --- a/aymurai/transforms/anonymization_postprocess/core.py +++ b/aymurai/transforms/anonymization_postprocess/core.py @@ -4,6 +4,7 @@ from aymurai.meta.pipeline_interfaces import Transform from aymurai.meta.types import DataItem from aymurai.utils.misc import get_element +from aymurai.transforms.anonymization_postprocess.exact_labels import EXACT_LABELS class AnonymizationEntityCleaner(Transform): @@ -32,6 +33,7 @@ def process(self, ent: dict) -> dict: original_text = ent["text"] start_char = ent["start_char"] end_char = ent["end_char"] + label = ent["attrs"]["aymurai_label"] # Match leading and trailing non-alphanumeric characters leading_match = re.match(r"^\W+", original_text) @@ -47,10 +49,24 @@ def process(self, ent: dict) -> dict: if not cleaned_text: return None + raw_subclass = ent["attrs"]["aymurai_label_subclass"] + if isinstance(raw_subclass, list): + aymurai_label_subclass = raw_subclass.copy() + elif raw_subclass: + aymurai_label_subclass = [raw_subclass] + else: + aymurai_label_subclass = [] + + if label in EXACT_LABELS: + flattened_text = re.sub(r"[^a-zA-Z0-9]", "", cleaned_text) + if flattened_text and flattened_text not in aymurai_label_subclass: + aymurai_label_subclass.append(flattened_text) + # Update the entity's alt text and indices ent["attrs"]["aymurai_alt_text"] = cleaned_text ent["attrs"]["aymurai_alt_start_char"] = start_char + leading_chars_removed ent["attrs"]["aymurai_alt_end_char"] = end_char - trailing_chars_removed + ent["attrs"]["aymurai_label_subclass"] = aymurai_label_subclass return ent diff --git a/aymurai/transforms/anonymization_postprocess/exact_labels.py b/aymurai/transforms/anonymization_postprocess/exact_labels.py new file mode 100644 index 0000000..afe345d --- /dev/null +++ b/aymurai/transforms/anonymization_postprocess/exact_labels.py @@ -0,0 +1,10 @@ +EXACT_LABELS = { + "DNI", + "CUIT_CUIL", + "TELEFONO", + "PATENTE_DOMINIO", + "IP", + "NUM_CAJA_AHORRO", + "CBU", + "NUM_MATRICULA", +} diff --git a/aymurai/utils/entity_disambiguation/fuzzy.py b/aymurai/utils/entity_disambiguation/fuzzy.py index 682810b..e55850d 100644 --- a/aymurai/utils/entity_disambiguation/fuzzy.py +++ b/aymurai/utils/entity_disambiguation/fuzzy.py @@ -7,6 +7,8 @@ from aymurai.meta.api_interfaces import DocLabel from aymurai.meta.entities import CanonicalEntity +from aymurai.transforms.anonymization_postprocess.exact_labels import EXACT_LABELS + def _find_parent(parent: list[int], idx: int) -> int: """ @@ -179,16 +181,45 @@ def build_canonical_entities( if target_labels and attrs.aymurai_label not in target_labels: continue alias = attrs.aymurai_alt_text or label.text + + subclass_val = getattr(attrs, "aymurai_label_subclass", None) + + if isinstance(subclass_val, list): + exact_alias = subclass_val[-1] if subclass_val else alias + else: + exact_alias = subclass_val or alias + grouped.setdefault(attrs.aymurai_label, []).append( - {"text": alias, "aymurai_label": attrs.aymurai_label} + { + "text": alias, + "aymurai_label": attrs.aymurai_label, + "exact_alias": exact_alias, + } ) canonical_entities: list[CanonicalEntity] = [] - for items in grouped.values(): - clusters = _cluster_aliases_with_cdist( - items=items, - threshold=threshold, - ) + for label_type, items in grouped.items(): + if label_type in EXACT_LABELS: + exact_groups = {} + for item in items: + exact_groups.setdefault(item["exact_alias"], []).append(item) + clusters = [ + [ + ( + item["text"], + str(item["exact_alias"]).lower().strip(), + item["aymurai_label"], + ) + for item in group_items + ] + for group_items in exact_groups.values() + ] + else: + clusters = _cluster_aliases_with_cdist( + items=items, + threshold=threshold, + ) + canonical_entities.extend(_clusters_to_canonical_entities(clusters)) canonical_entities = sorted(canonical_entities, key=lambda x: x.canonical_text) diff --git a/notebooks/experiments/entity-disambiguation/10-anonymize-document-render-policy.ipynb b/notebooks/experiments/entity-disambiguation/10-anonymize-document-render-policy.ipynb index c072066..92bc818 100644 --- a/notebooks/experiments/entity-disambiguation/10-anonymize-document-render-policy.ipynb +++ b/notebooks/experiments/entity-disambiguation/10-anonymize-document-render-policy.ipynb @@ -64,7 +64,7 @@ "\n", "print(f\"Found {len(documents)} documents\")\n", "\n", - "doc_path = documents[14]\n", + "doc_path = documents[5]\n", "print(f\"Processing document: {doc_path}\")" ] }, @@ -430,6 +430,13 @@ "result = group_alt_texts_by_entity(anonymize_labels_fuzzy)\n", "print(json.dumps(result, indent=4, ensure_ascii=False))" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/tests/api/routers/anonymizer/test_anonymizer.py b/tests/api/routers/anonymizer/test_anonymizer.py index 8225e29..8e3e456 100644 --- a/tests/api/routers/anonymizer/test_anonymizer.py +++ b/tests/api/routers/anonymizer/test_anonymizer.py @@ -545,6 +545,58 @@ def test_should_dedupe_duplicate_labels_when_returning_cached_prediction( mock_load_pipeline.assert_not_called() +@pytest.mark.integration +@patch("aymurai.api.endpoints.routers.anonymizer.anonymizer.load_pipeline") +def test_should_merge_cached_duplicate_labels_for_same_span_and_label( + mock_load_pipeline, client, db_session +): + text = ( + "Víctima: María Paula Trucha, DNI 23.456.789, quien se encuentra " + "conectada con su cámara apagada." + ) + dni_label = build_label("DNI", "23.456.789").model_dump(mode="json") + dni_label.update({"start_char": 33, "end_char": 43}) + dni_label["attrs"].update( + { + "aymurai_alt_text": "23.456.789", + "aymurai_alt_start_char": 33, + "aymurai_alt_end_char": 43, + "aymurai_label_instance": 2, + "aymurai_disambiguation": "fuzzy", + "aymurai_anonymize": True, + "canonical_entity_id": "0bba6d15-1b0c-51f0-b2ca-4fdc8a57cb73", + } + ) + enriched_dni_label = { + **dni_label, + "attrs": { + **dni_label["attrs"], + "aymurai_label_subclass": ["23456789"], + }, + } + + db_session.add( + AnonymizationParagraph( + id=text_to_uuid(text), + text=text, + prediction=[dni_label, enriched_dni_label], + ) + ) + db_session.commit() + + response = client.post( + "/anonymizer/predict", + json={"text": text}, + params={"use_cache": True}, + ) + + assert response.status_code == 200 + data = response.json() + assert data["document"] == text + assert data["labels"] == [enriched_dni_label] + mock_load_pipeline.assert_not_called() + + @pytest.mark.integration @patch( "aymurai.api.endpoints.routers.anonymizer.anonymizer.map_canonical_entities_ner_preds"