From 49d5c5fb68ef1c78a208af66e0be252ca51e8237 Mon Sep 17 00:00:00 2001
From: conrabeatriz <conrabeatriz@gmail.com>
Date: Wed, 20 May 2026 19:02:58 -0300
Subject: [PATCH 1/5] =?UTF-8?q?=F0=9F=90=9B=20Fix=20identifier=20entities?=
 =?UTF-8?q?=20that=20must=20match=20exactly=20so=20they=20bypass=20the?=
 =?UTF-8?q?=20fuzzy=20matching=20algorithm.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../anonymization_postprocess/core.py         | 18 ++++++++
 aymurai/utils/entity_disambiguation/fuzzy.py  | 43 ++++++++++++++++---
 .../10-anonymize-document-render-policy.ipynb |  2 +-
 3 files changed, 56 insertions(+), 7 deletions(-)

diff --git a/aymurai/transforms/anonymization_postprocess/core.py b/aymurai/transforms/anonymization_postprocess/core.py
index 977fb109..43df2bf7 100644
--- a/aymurai/transforms/anonymization_postprocess/core.py
+++ b/aymurai/transforms/anonymization_postprocess/core.py
@@ -33,6 +33,7 @@ def process(self, ent: dict) -> dict:
         original_text = ent["text"]
         start_char = ent["start_char"]
         end_char = ent["end_char"]
+        label = ent["attrs"]["aymurai_label"]
 
         # Match leading and trailing non-alphanumeric characters
         leading_match = re.match(r"^\W+", original_text)
@@ -45,6 +46,23 @@ def process(self, ent: dict) -> dict:
         # Clean the text
         cleaned_text = pattern.sub("", original_text)
 
+        exact_labels = {
+            "DNI",
+            "CUIT_CUIL",
+            "TELEFONO",
+            "PATENTE_DOMINIO",
+            "IP",
+            "NUM_CAJA_AHORRO",
+            "CBU",
+            "NUM_MATRICULA",
+        }
+
+        ent["attrs"]["aymurai_label_subclass"] = []
+
+        if label in exact_labels:
+            flattened_text = re.sub(r"[^a-zA-Z0-9]", "", cleaned_text)
+            ent["attrs"]["aymurai_label_subclass"].append(flattened_text)
+
         # Update the entity's alt text and indices
         ent["attrs"]["aymurai_alt_text"] = cleaned_text
         ent["attrs"]["aymurai_alt_start_char"] = start_char + leading_chars_removed
diff --git a/aymurai/utils/entity_disambiguation/fuzzy.py b/aymurai/utils/entity_disambiguation/fuzzy.py
index 682810b6..7255b60f 100644
--- a/aymurai/utils/entity_disambiguation/fuzzy.py
+++ b/aymurai/utils/entity_disambiguation/fuzzy.py
@@ -7,6 +7,17 @@
 from aymurai.meta.api_interfaces import DocLabel
 from aymurai.meta.entities import CanonicalEntity
 
+EXACT_LABELS = {
+    "DNI",
+    "CUIT_CUIL",
+    "TELEFONO",
+    "PATENTE_DOMINIO",
+    "IP",
+    "NUM_CAJA_AHORRO",
+    "CBU",
+    "NUM_MATRICULA",
+}
+
 
 def _find_parent(parent: list[int], idx: int) -> int:
     """
@@ -179,16 +190,36 @@ def build_canonical_entities(
         if target_labels and attrs.aymurai_label not in target_labels:
             continue
         alias = attrs.aymurai_alt_text or label.text
+
+        subclass_val = getattr(attrs, "aymurai_label_subclass", None)
+
+        if isinstance(subclass_val, list):
+            exact_alias = subclass_val[-1] if subclass_val else alias
+        else:
+            exact_alias = subclass_val or alias
+
         grouped.setdefault(attrs.aymurai_label, []).append(
-            {"text": alias, "aymurai_label": attrs.aymurai_label}
+            {
+                "text": alias,
+                "aymurai_label": attrs.aymurai_label,
+                "exact_alias": exact_alias,
+            }
         )
 
     canonical_entities: list[CanonicalEntity] = []
-    for items in grouped.values():
-        clusters = _cluster_aliases_with_cdist(
-            items=items,
-            threshold=threshold,
-        )
+    for label_type, items in grouped.items():
+        if label_type in EXACT_LABELS:
+            exact_groups = {}
+            for item in items:
+                exact_groups.setdefault(item["exact_alias"], []).append(item)
+
+            clusters = list(exact_groups.values())
+        else:
+            clusters = _cluster_aliases_with_cdist(
+                items=items,
+                threshold=threshold,
+            )
+
         canonical_entities.extend(_clusters_to_canonical_entities(clusters))
 
     canonical_entities = sorted(canonical_entities, key=lambda x: x.canonical_text)
diff --git a/notebooks/experiments/entity-disambiguation/10-anonymize-document-render-policy.ipynb b/notebooks/experiments/entity-disambiguation/10-anonymize-document-render-policy.ipynb
index c072066f..6ea1fba6 100644
--- a/notebooks/experiments/entity-disambiguation/10-anonymize-document-render-policy.ipynb
+++ b/notebooks/experiments/entity-disambiguation/10-anonymize-document-render-policy.ipynb
@@ -64,7 +64,7 @@
     "\n",
     "print(f\"Found {len(documents)} documents\")\n",
     "\n",
-    "doc_path = documents[14]\n",
+    "doc_path = documents[5]\n",
     "print(f\"Processing document: {doc_path}\")"
    ]
   },

From 6822b329524d9241dd0523f8187b2b0621be099e Mon Sep 17 00:00:00 2001
From: conrabeatriz <conrabeatriz@gmail.com>
Date: Tue, 26 May 2026 18:02:20 -0300
Subject: [PATCH 2/5] =?UTF-8?q?=E2=9A=A1=EF=B8=8F=20Improved=20structure?=
 =?UTF-8?q?=20following=20copilot=20comments.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../anonymization_postprocess/core.py         | 26 +++++++++----------
 .../anonymization_postprocess/exact_labels.py | 10 +++++++
 aymurai/utils/entity_disambiguation/fuzzy.py  | 24 ++++++++---------
 3 files changed, 34 insertions(+), 26 deletions(-)
 create mode 100644 aymurai/transforms/anonymization_postprocess/exact_labels.py

diff --git a/aymurai/transforms/anonymization_postprocess/core.py b/aymurai/transforms/anonymization_postprocess/core.py
index 7f344007..af5185e4 100644
--- a/aymurai/transforms/anonymization_postprocess/core.py
+++ b/aymurai/transforms/anonymization_postprocess/core.py
@@ -4,6 +4,7 @@
 from aymurai.meta.pipeline_interfaces import Transform
 from aymurai.meta.types import DataItem
 from aymurai.utils.misc import get_element
+from aymurai.transforms.anonymization_postprocess.exact_labels import EXACT_LABELS
 
 
 class AnonymizationEntityCleaner(Transform):
@@ -48,27 +49,24 @@ def process(self, ent: dict) -> dict:
         if not cleaned_text:
             return None
 
-        exact_labels = {
-            "DNI",
-            "CUIT_CUIL",
-            "TELEFONO",
-            "PATENTE_DOMINIO",
-            "IP",
-            "NUM_CAJA_AHORRO",
-            "CBU",
-            "NUM_MATRICULA",
-        }
+        # Copy any existing subclass value (list, scalar or absent) into a new list;
+        # .get() avoids a KeyError for entities that do not carry the key yet.
+        raw_subclass = ent["attrs"].get("aymurai_label_subclass")
+        if isinstance(raw_subclass, list):
+            aymurai_label_subclass = raw_subclass.copy()
+        else:
+            aymurai_label_subclass = [raw_subclass] if raw_subclass else []
 
-        ent["attrs"]["aymurai_label_subclass"] = []
-
-        if label in exact_labels:
+        if label in EXACT_LABELS:
             flattened_text = re.sub(r"[^a-zA-Z0-9]", "", cleaned_text)
-            ent["attrs"]["aymurai_label_subclass"].append(flattened_text)
+            if flattened_text and flattened_text not in aymurai_label_subclass:
+                aymurai_label_subclass.append(flattened_text)
 
         # Update the entity's alt text and indices
         ent["attrs"]["aymurai_alt_text"] = cleaned_text
         ent["attrs"]["aymurai_alt_start_char"] = start_char + leading_chars_removed
         ent["attrs"]["aymurai_alt_end_char"] = end_char - trailing_chars_removed
+        ent["attrs"]["aymurai_label_subclass"] = aymurai_label_subclass
 
         return ent
 
diff --git a/aymurai/transforms/anonymization_postprocess/exact_labels.py b/aymurai/transforms/anonymization_postprocess/exact_labels.py
new file mode 100644
index 00000000..afe345da
--- /dev/null
+++ b/aymurai/transforms/anonymization_postprocess/exact_labels.py
@@ -0,0 +1,17 @@
+"""Entity labels that are grouped by exact value instead of fuzzy matching."""
+
+# Identifier-like labels: the anonymization cleaner stores a flattened
+# alphanumeric form of their text, and canonical-entity building groups them
+# by that value. Frozen so importers cannot mutate the shared set.
+EXACT_LABELS = frozenset(
+    {
+        "DNI",
+        "CUIT_CUIL",
+        "TELEFONO",
+        "PATENTE_DOMINIO",
+        "IP",
+        "NUM_CAJA_AHORRO",
+        "CBU",
+        "NUM_MATRICULA",
+    }
+)
diff --git a/aymurai/utils/entity_disambiguation/fuzzy.py b/aymurai/utils/entity_disambiguation/fuzzy.py
index 7255b60f..e55850d1 100644
--- a/aymurai/utils/entity_disambiguation/fuzzy.py
+++ b/aymurai/utils/entity_disambiguation/fuzzy.py
@@ -7,16 +7,7 @@
 from aymurai.meta.api_interfaces import DocLabel
 from aymurai.meta.entities import CanonicalEntity
 
-EXACT_LABELS = {
-    "DNI",
-    "CUIT_CUIL",
-    "TELEFONO",
-    "PATENTE_DOMINIO",
-    "IP",
-    "NUM_CAJA_AHORRO",
-    "CBU",
-    "NUM_MATRICULA",
-}
+from aymurai.transforms.anonymization_postprocess.exact_labels import EXACT_LABELS
 
 
 def _find_parent(parent: list[int], idx: int) -> int:
@@ -212,8 +203,12 @@ def build_canonical_entities(
             exact_groups = {}
             for item in items:
-                exact_groups.setdefault(item["exact_alias"], []).append(item)
-
-            clusters = list(exact_groups.values())
+                # Group on the same normalized key that is emitted in the tuple, so
+                # values differing only in case or outer whitespace share a cluster.
+                exact_key = str(item["exact_alias"]).lower().strip()
+                exact_groups.setdefault(exact_key, []).append(
+                    (item["text"], exact_key, item["aymurai_label"])
+                )
+            clusters = list(exact_groups.values())
         else:
             clusters = _cluster_aliases_with_cdist(
                 items=items,

From c963e3daa70a0d8cd9e3a80a0ed8d695ac2f23bc Mon Sep 17 00:00:00 2001
From: conrabeatriz <conrabeatriz@gmail.com>
Date: Tue, 26 May 2026 18:05:31 -0300
Subject: [PATCH 3/5] =?UTF-8?q?=E2=9A=97=EF=B8=8F=20Experimentation.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../10-anonymize-document-render-policy.ipynb              | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/notebooks/experiments/entity-disambiguation/10-anonymize-document-render-policy.ipynb b/notebooks/experiments/entity-disambiguation/10-anonymize-document-render-policy.ipynb
index 6ea1fba6..92bc8185 100644
--- a/notebooks/experiments/entity-disambiguation/10-anonymize-document-render-policy.ipynb
+++ b/notebooks/experiments/entity-disambiguation/10-anonymize-document-render-policy.ipynb
@@ -430,6 +430,13 @@
     "result = group_alt_texts_by_entity(anonymize_labels_fuzzy)\n",
     "print(json.dumps(result, indent=4, ensure_ascii=False))"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {

From 19603dc3dbdbebb2c3db91947742bc216bb8ba3d Mon Sep 17 00:00:00 2001
From: jansaldo <julianansaldo@gmail.com>
Date: Wed, 27 May 2026 18:51:52 +0000
Subject: [PATCH 4/5] =?UTF-8?q?=F0=9F=90=9B=20Merge=20duplicate=20labels?=
 =?UTF-8?q?=20for=20the=20same=20span=20and=20AymurAI=20label=20in=20=5Fde?=
 =?UTF-8?q?dupe=5Fdoclabels=20function?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../routers/anonymizer/anonymizer.py          | 44 ++++++++++++++-----
 1 file changed, 34 insertions(+), 10 deletions(-)

diff --git a/aymurai/api/endpoints/routers/anonymizer/anonymizer.py b/aymurai/api/endpoints/routers/anonymizer/anonymizer.py
index 6bc7b494..6fde1db7 100644
--- a/aymurai/api/endpoints/routers/anonymizer/anonymizer.py
+++ b/aymurai/api/endpoints/routers/anonymizer/anonymizer.py
@@ -89,30 +89,54 @@ def _entities_to_doclabels(entities: list[dict]) -> list[DocLabel]:
 
 def _dedupe_doclabels(labels: Iterable[DocLabel]) -> list[DocLabel]:
     """
-    Remove exact duplicate labels while preserving first-seen order.
+    Merge duplicate labels for the same span and AymurAI label.
 
     Args:
         labels (Iterable[DocLabel]): An iterable of DocLabel objects,
             potentially containing duplicates.
 
     Returns:
-        list[DocLabel]: A list of DocLabel objects with duplicates removed,
+        list[DocLabel]: A list of DocLabel objects with duplicates merged,
             preserving the order of first occurrence.
     """
     deduped: list[DocLabel] = []
-    seen: set[str] = set()
+    index_by_key: dict[tuple[int, int, str], int] = {}
 
     for label in labels:
-        key = json.dumps(
-            label.model_dump(mode="json", exclude_none=True),
-            sort_keys=True,
-            separators=(",", ":"),
+        key = (
+            label.start_char,
+            label.end_char,
+            label.attrs.aymurai_label if label.attrs else "",
         )
-        if key in seen:
+        existing_index = index_by_key.get(key)
+        if existing_index is None:
+            index_by_key[key] = len(deduped)
+            deduped.append(label)
             continue
 
-        seen.add(key)
-        deduped.append(label)
+        existing = deduped[existing_index]
+        existing_data = existing.model_dump(mode="json")
+        incoming_data = label.model_dump(mode="json")
+        existing_attrs = existing_data.get("attrs") or {}
+        incoming_attrs = incoming_data.get("attrs") or {}
+
+        empty_values = (None, [], "")
+        for attr_key, incoming_value in incoming_attrs.items():
+            existing_value = existing_attrs.get(attr_key)
+            if attr_key == "aymurai_label_subclass":
+                # Union of both subclass lists, first-seen order, no repeats.
+                merged = list(existing_value or [])
+                for subclass in incoming_value or []:
+                    if subclass not in merged:
+                        merged.append(subclass)
+                # Nothing to merge: keep the stored value (do not turn None into []).
+                existing_attrs[attr_key] = merged or existing_value
+            elif existing_value in empty_values and incoming_value not in empty_values:
+                # Other attrs: the first non-empty value wins.
+                existing_attrs[attr_key] = incoming_value
+
+        existing_data["attrs"] = existing_attrs
+        deduped[existing_index] = DocLabel.model_validate(existing_data)
 
     return deduped
 

From 221ab6dc70b8d6ddba01c212bfd6f9bc9ec1d292 Mon Sep 17 00:00:00 2001
From: jansaldo <julianansaldo@gmail.com>
Date: Wed, 27 May 2026 18:52:35 +0000
Subject: [PATCH 5/5] =?UTF-8?q?=E2=9C=85=20Add=20integration=20test=20for?=
 =?UTF-8?q?=20merging=20cached=20duplicate=20labels=20for=20the=20same=20s?=
 =?UTF-8?q?pan?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../api/routers/anonymizer/test_anonymizer.py | 52 +++++++++++++++++++
 1 file changed, 52 insertions(+)

diff --git a/tests/api/routers/anonymizer/test_anonymizer.py b/tests/api/routers/anonymizer/test_anonymizer.py
index 8225e29d..8e3e4569 100644
--- a/tests/api/routers/anonymizer/test_anonymizer.py
+++ b/tests/api/routers/anonymizer/test_anonymizer.py
@@ -545,6 +545,58 @@ def test_should_dedupe_duplicate_labels_when_returning_cached_prediction(
     mock_load_pipeline.assert_not_called()
 
 
+@pytest.mark.integration
+@patch("aymurai.api.endpoints.routers.anonymizer.anonymizer.load_pipeline")
+def test_should_merge_cached_duplicate_labels_for_same_span_and_label(
+    mock_load_pipeline, client, db_session
+):
+    text = (
+        "Víctima: María Paula Trucha, DNI 23.456.789, quien se encuentra "
+        "conectada con su cámara apagada."
+    )
+    dni_label = build_label("DNI", "23.456.789").model_dump(mode="json")
+    dni_label.update({"start_char": 33, "end_char": 43})
+    dni_label["attrs"].update(
+        {
+            "aymurai_alt_text": "23.456.789",
+            "aymurai_alt_start_char": 33,
+            "aymurai_alt_end_char": 43,
+            "aymurai_label_instance": 2,
+            "aymurai_disambiguation": "fuzzy",
+            "aymurai_anonymize": True,
+            "canonical_entity_id": "0bba6d15-1b0c-51f0-b2ca-4fdc8a57cb73",
+        }
+    )
+    enriched_dni_label = {
+        **dni_label,
+        "attrs": {
+            **dni_label["attrs"],
+            "aymurai_label_subclass": ["23456789"],
+        },
+    }
+
+    db_session.add(
+        AnonymizationParagraph(
+            id=text_to_uuid(text),
+            text=text,
+            prediction=[dni_label, enriched_dni_label],
+        )
+    )
+    db_session.commit()
+
+    response = client.post(
+        "/anonymizer/predict",
+        json={"text": text},
+        params={"use_cache": True},
+    )
+
+    assert response.status_code == 200
+    data = response.json()
+    assert data["document"] == text
+    assert data["labels"] == [enriched_dni_label]
+    mock_load_pipeline.assert_not_called()
+
+
 @pytest.mark.integration
 @patch(
     "aymurai.api.endpoints.routers.anonymizer.anonymizer.map_canonical_entities_ner_preds"