AymurAI · jansaldo · May 27, 2026 · May 20, 2026 · May 25, 2026 · May 26, 2026
diff --git a/aymurai/api/endpoints/routers/anonymizer/anonymizer.py b/aymurai/api/endpoints/routers/anonymizer/anonymizer.py
@@ -89,30 +89,75 @@ def _entities_to_doclabels(entities: list[dict]) -> list[DocLabel]:
 
 def _dedupe_doclabels(labels: Iterable[DocLabel]) -> list[DocLabel]:
     """
-    Remove exact duplicate labels while preserving first-seen order.
+    Merge duplicate labels for the same span and AymurAI label.
+
+    Labels sharing ``(start_char, end_char, aymurai_label)`` collapse into
+    the first one seen. Subclass values are unioned in first-seen order, and
+    any other attribute that is empty (``None``, ``[]`` or ``""``) on the kept
+    label is filled in from the duplicate. Values already set on the kept
+    label are never overwritten, and the kept label is left untouched when a
+    duplicate contributes nothing new.
 
     Args:
         labels (Iterable[DocLabel]): An iterable of DocLabel objects,
             potentially containing duplicates.
 
     Returns:
-        list[DocLabel]: A list of DocLabel objects with duplicates removed,
+        list[DocLabel]: A list of DocLabel objects with duplicates merged,
             preserving the order of first occurrence.
     """
     deduped: list[DocLabel] = []
-    seen: set[str] = set()
+    index_by_key: dict[tuple[int, int, str], int] = {}
 
     for label in labels:
-        key = json.dumps(
-            label.model_dump(mode="json", exclude_none=True),
-            sort_keys=True,
-            separators=(",", ":"),
+        key = (
+            label.start_char,
+            label.end_char,
+            label.attrs.aymurai_label if label.attrs else "",
         )
-        if key in seen:
+        existing_index = index_by_key.get(key)
+        if existing_index is None:
+            index_by_key[key] = len(deduped)
+            deduped.append(label)
             continue
 
-        seen.add(key)
-        deduped.append(label)
+        existing_data = deduped[existing_index].model_dump(mode="json")
+        existing_attrs = existing_data.get("attrs") or {}
+        incoming_attrs = label.model_dump(mode="json").get("attrs") or {}
+        changed = False
+
+        for attr_key, incoming_value in incoming_attrs.items():
+            existing_value = existing_attrs.get(attr_key)
+            if attr_key == "aymurai_label_subclass":
+                # Wrap scalars so a bare string is not split into characters.
+                if isinstance(existing_value, list):
+                    merged = list(existing_value)
+                else:
+                    merged = [existing_value] if existing_value else []
+                if isinstance(incoming_value, list):
+                    candidates = incoming_value
+                else:
+                    candidates = [incoming_value] if incoming_value else []
+                known = len(merged)
+                for subclass in candidates:
+                    if subclass not in merged:
+                        merged.append(subclass)
+                if len(merged) > known:
+                    existing_attrs[attr_key] = merged
+                    changed = True
+            elif existing_value in (None, [], "") and incoming_value not in (
+                None,
+                [],
+                "",
+            ):
+                existing_attrs[attr_key] = incoming_value
+                changed = True
+
+        # Skip the rebuild for pure duplicates: it would otherwise replace a
+        # missing ``attrs`` with ``{}`` and re-validate for no benefit.
+        if changed:
+            existing_data["attrs"] = existing_attrs
+            deduped[existing_index] = DocLabel.model_validate(existing_data)
 
     return deduped
 

diff --git a/aymurai/transforms/anonymization_postprocess/core.py b/aymurai/transforms/anonymization_postprocess/core.py
@@ -4,6 +4,7 @@
 from aymurai.meta.pipeline_interfaces import Transform
 from aymurai.meta.types import DataItem
 from aymurai.utils.misc import get_element
+from aymurai.transforms.anonymization_postprocess.exact_labels import EXACT_LABELS
 
 
 class AnonymizationEntityCleaner(Transform):
@@ -32,6 +33,7 @@ def process(self, ent: dict) -> dict:
         original_text = ent["text"]
         start_char = ent["start_char"]
         end_char = ent["end_char"]
+        label = ent["attrs"]["aymurai_label"]
 
         # Match leading and trailing non-alphanumeric characters
         leading_match = re.match(r"^\W+", original_text)
@@ -47,10 +49,24 @@ def process(self, ent: dict) -> dict:
         if not cleaned_text:
             return None
 
+        raw_subclass = ent["attrs"]["aymurai_label_subclass"]
+        if isinstance(raw_subclass, list):
+            aymurai_label_subclass = raw_subclass.copy()
+        elif raw_subclass:
+            aymurai_label_subclass = [raw_subclass]
+        else:
+            aymurai_label_subclass = []
+
+        if label in EXACT_LABELS:
+            flattened_text = re.sub(r"[^a-zA-Z0-9]", "", cleaned_text)
+            if flattened_text and flattened_text not in aymurai_label_subclass:
+                aymurai_label_subclass.append(flattened_text)
+
         # Update the entity's alt text and indices
         ent["attrs"]["aymurai_alt_text"] = cleaned_text
         ent["attrs"]["aymurai_alt_start_char"] = start_char + leading_chars_removed
         ent["attrs"]["aymurai_alt_end_char"] = end_char - trailing_chars_removed
+        ent["attrs"]["aymurai_label_subclass"] = aymurai_label_subclass
 
         return ent
 

diff --git a/aymurai/transforms/anonymization_postprocess/exact_labels.py b/aymurai/transforms/anonymization_postprocess/exact_labels.py
@@ -0,0 +1,10 @@
+EXACT_LABELS = {  # labels grouped by exact alias instead of fuzzy clustering
+    "DNI",
+    "CUIT_CUIL",
+    "TELEFONO",
+    "PATENTE_DOMINIO",
+    "IP",
+    "NUM_CAJA_AHORRO",
+    "CBU",
+    "NUM_MATRICULA",
+}
diff --git a/aymurai/utils/entity_disambiguation/fuzzy.py b/aymurai/utils/entity_disambiguation/fuzzy.py
@@ -7,6 +7,8 @@
 from aymurai.meta.api_interfaces import DocLabel
 from aymurai.meta.entities import CanonicalEntity
 
+from aymurai.transforms.anonymization_postprocess.exact_labels import EXACT_LABELS
+
 
 def _find_parent(parent: list[int], idx: int) -> int:
     """
@@ -179,16 +181,45 @@ def build_canonical_entities(
         if target_labels and attrs.aymurai_label not in target_labels:
             continue
         alias = attrs.aymurai_alt_text or label.text
+
+        subclass_val = getattr(attrs, "aymurai_label_subclass", None)
+
+        if isinstance(subclass_val, list):
+            exact_alias = subclass_val[-1] if subclass_val else alias
+        else:
+            exact_alias = subclass_val or alias
+
         grouped.setdefault(attrs.aymurai_label, []).append(
-            {"text": alias, "aymurai_label": attrs.aymurai_label}
+            {
+                "text": alias,
+                "aymurai_label": attrs.aymurai_label,
+                "exact_alias": exact_alias,
+            }
         )
 
     canonical_entities: list[CanonicalEntity] = []
-    for items in grouped.values():
-        clusters = _cluster_aliases_with_cdist(
-            items=items,
-            threshold=threshold,
-        )
+    for label_type, items in grouped.items():
+        if label_type in EXACT_LABELS:
+            exact_groups = {}
+            for item in items:
+                exact_groups.setdefault(item["exact_alias"], []).append(item)
+            clusters = [
+                [
+                    (
+                        item["text"],
+                        str(item["exact_alias"]).lower().strip(),
+                        item["aymurai_label"],
+                    )
+                    for item in group_items
+                ]
+                for group_items in exact_groups.values()
+            ]
+        else:
+            clusters = _cluster_aliases_with_cdist(
+                items=items,
+                threshold=threshold,
+            )
+
         canonical_entities.extend(_clusters_to_canonical_entities(clusters))
 
     canonical_entities = sorted(canonical_entities, key=lambda x: x.canonical_text)

diff --git a/notebooks/experiments/entity-disambiguation/10-anonymize-document-render-policy.ipynb b/notebooks/experiments/entity-disambiguation/10-anonymize-document-render-policy.ipynb
@@ -64,7 +64,7 @@
     "\n",
     "print(f\"Found {len(documents)} documents\")\n",
     "\n",
-    "doc_path = documents[14]\n",
+    "doc_path = documents[5]\n",
     "print(f\"Processing document: {doc_path}\")"
    ]
   },
@@ -430,6 +430,13 @@
     "result = group_alt_texts_by_entity(anonymize_labels_fuzzy)\n",
     "print(json.dumps(result, indent=4, ensure_ascii=False))"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {

diff --git a/tests/api/routers/anonymizer/test_anonymizer.py b/tests/api/routers/anonymizer/test_anonymizer.py
@@ -545,6 +545,58 @@ def test_should_dedupe_duplicate_labels_when_returning_cached_prediction(
     mock_load_pipeline.assert_not_called()
 
 
+@pytest.mark.integration
+@patch("aymurai.api.endpoints.routers.anonymizer.anonymizer.load_pipeline")
+def test_should_merge_cached_duplicate_labels_for_same_span_and_label(
+    mock_load_pipeline, client, db_session
+):
+    text = (
+        "Víctima: María Paula Trucha, DNI 23.456.789, quien se encuentra "
+        "conectada con su cámara apagada."
+    )
+    dni_label = build_label("DNI", "23.456.789").model_dump(mode="json")
+    dni_label.update({"start_char": 33, "end_char": 43})  # span of the DNI number
+    dni_label["attrs"].update(
+        {
+            "aymurai_alt_text": "23.456.789",
+            "aymurai_alt_start_char": 33,
+            "aymurai_alt_end_char": 43,
+            "aymurai_label_instance": 2,
+            "aymurai_disambiguation": "fuzzy",
+            "aymurai_anonymize": True,
+            "canonical_entity_id": "0bba6d15-1b0c-51f0-b2ca-4fdc8a57cb73",
+        }
+    )
+    enriched_dni_label = {  # same span and label, plus a subclass value
+        **dni_label,
+        "attrs": {
+            **dni_label["attrs"],
+            "aymurai_label_subclass": ["23456789"],
+        },
+    }
+
+    db_session.add(
+        AnonymizationParagraph(
+            id=text_to_uuid(text),
+            text=text,
+            prediction=[dni_label, enriched_dni_label],  # cached duplicates
+        )
+    )
+    db_session.commit()
+
+    response = client.post(
+        "/anonymizer/predict",
+        json={"text": text},
+        params={"use_cache": True},
+    )
+
+    assert response.status_code == 200
+    data = response.json()
+    assert data["document"] == text
+    assert data["labels"] == [enriched_dni_label]  # merged into one label
+    mock_load_pipeline.assert_not_called()  # pipeline must not be loaded
+
+
 @pytest.mark.integration
 @patch(
     "aymurai.api.endpoints.routers.anonymizer.anonymizer.map_canonical_entities_ner_preds"