Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 34 additions & 10 deletions aymurai/api/endpoints/routers/anonymizer/anonymizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,30 +89,54 @@ def _entities_to_doclabels(entities: list[dict]) -> list[DocLabel]:

def _as_subclass_list(value) -> list:
    """Return a subclass value as a fresh list (list copied, scalar wrapped, empty -> [])."""
    if isinstance(value, list):
        return list(value)
    return [value] if value else []


def _dedupe_doclabels(labels: Iterable[DocLabel]) -> list[DocLabel]:
    """
    Merge duplicate labels for the same span and AymurAI label.

    Two labels are treated as duplicates when they share ``start_char``,
    ``end_char`` and ``attrs.aymurai_label`` (an empty string is used as the
    label when ``attrs`` is missing). The first occurrence keeps its position
    in the output; every later duplicate is folded into it:

    * ``aymurai_label_subclass`` values are unioned, keeping first-seen order.
    * Any other attribute is copied from the duplicate only when the value
      already stored is ``None``, ``[]`` or ``""`` and the incoming one is not.

    Args:
        labels (Iterable[DocLabel]): An iterable of DocLabel objects,
            potentially containing duplicates.

    Returns:
        list[DocLabel]: A list of DocLabel objects with duplicates merged,
            preserving the order of first occurrence.
    """
    deduped: list[DocLabel] = []
    # Maps (start_char, end_char, aymurai_label) -> position in ``deduped``.
    index_by_key: dict[tuple[int, int, str], int] = {}
    # Values considered "not set" when deciding whether to take the incoming
    # attribute. Falsy-but-meaningful values such as 0 or False are not listed
    # here, so they are never overwritten.
    empty_values = (None, [], "")

    for label in labels:
        key = (
            label.start_char,
            label.end_char,
            label.attrs.aymurai_label if label.attrs else "",
        )
        existing_index = index_by_key.get(key)
        if existing_index is None:
            # First time this span/label pair is seen: keep it untouched.
            index_by_key[key] = len(deduped)
            deduped.append(label)
            continue

        # Merge on plain dicts and re-validate afterwards, so the stored label
        # is rebuilt from the merged data instead of being mutated in place.
        existing = deduped[existing_index]
        existing_data = existing.model_dump(mode="json")
        incoming_data = label.model_dump(mode="json")
        existing_attrs = existing_data.get("attrs") or {}
        incoming_attrs = incoming_data.get("attrs") or {}

        for attr_key, incoming_value in incoming_attrs.items():
            existing_value = existing_attrs.get(attr_key)
            if attr_key == "aymurai_label_subclass":
                # Normalise both sides first: a scalar string must be treated
                # as one subclass, not iterated character by character.
                merged = _as_subclass_list(existing_value)
                for subclass in _as_subclass_list(incoming_value):
                    if subclass not in merged:
                        merged.append(subclass)
                existing_attrs[attr_key] = merged
            elif (
                existing_value in empty_values
                and incoming_value not in empty_values
            ):
                existing_attrs[attr_key] = incoming_value

        existing_data["attrs"] = existing_attrs
        deduped[existing_index] = DocLabel.model_validate(existing_data)

    return deduped

Expand Down
16 changes: 16 additions & 0 deletions aymurai/transforms/anonymization_postprocess/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from aymurai.meta.pipeline_interfaces import Transform
from aymurai.meta.types import DataItem
from aymurai.utils.misc import get_element
from aymurai.transforms.anonymization_postprocess.exact_labels import EXACT_LABELS


class AnonymizationEntityCleaner(Transform):
Expand Down Expand Up @@ -32,6 +33,7 @@ def process(self, ent: dict) -> dict:
original_text = ent["text"]
start_char = ent["start_char"]
end_char = ent["end_char"]
label = ent["attrs"]["aymurai_label"]

# Match leading and trailing non-alphanumeric characters
leading_match = re.match(r"^\W+", original_text)
Expand All @@ -47,10 +49,24 @@ def process(self, ent: dict) -> dict:
if not cleaned_text:
return None

raw_subclass = ent["attrs"]["aymurai_label_subclass"]
if isinstance(raw_subclass, list):
aymurai_label_subclass = raw_subclass.copy()
elif raw_subclass:
aymurai_label_subclass = [raw_subclass]
else:
aymurai_label_subclass = []

if label in EXACT_LABELS:
flattened_text = re.sub(r"[^a-zA-Z0-9]", "", cleaned_text)
if flattened_text and flattened_text not in aymurai_label_subclass:
aymurai_label_subclass.append(flattened_text)

# Update the entity's alt text and indices
ent["attrs"]["aymurai_alt_text"] = cleaned_text
ent["attrs"]["aymurai_alt_start_char"] = start_char + leading_chars_removed
ent["attrs"]["aymurai_alt_end_char"] = end_char - trailing_chars_removed
ent["attrs"]["aymurai_label_subclass"] = aymurai_label_subclass

return ent

Expand Down
10 changes: 10 additions & 0 deletions aymurai/transforms/anonymization_postprocess/exact_labels.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Entity labels that are matched on their exact value instead of by fuzzy text
# similarity. The anonymization post-processing adds an alphanumeric-only form
# of the text to the subclass list for these labels, and canonical-entity
# building groups their mentions by that exact alias.
EXACT_LABELS = {
    "CBU",
    "CUIT_CUIL",
    "DNI",
    "IP",
    "NUM_CAJA_AHORRO",
    "NUM_MATRICULA",
    "PATENTE_DOMINIO",
    "TELEFONO",
}
43 changes: 37 additions & 6 deletions aymurai/utils/entity_disambiguation/fuzzy.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
from aymurai.meta.api_interfaces import DocLabel
from aymurai.meta.entities import CanonicalEntity

from aymurai.transforms.anonymization_postprocess.exact_labels import EXACT_LABELS


def _find_parent(parent: list[int], idx: int) -> int:
"""
Expand Down Expand Up @@ -179,16 +181,45 @@ def build_canonical_entities(
if target_labels and attrs.aymurai_label not in target_labels:
continue
alias = attrs.aymurai_alt_text or label.text

subclass_val = getattr(attrs, "aymurai_label_subclass", None)

if isinstance(subclass_val, list):
exact_alias = subclass_val[-1] if subclass_val else alias
else:
exact_alias = subclass_val or alias

grouped.setdefault(attrs.aymurai_label, []).append(
{"text": alias, "aymurai_label": attrs.aymurai_label}
{
"text": alias,
"aymurai_label": attrs.aymurai_label,
"exact_alias": exact_alias,
}
)

canonical_entities: list[CanonicalEntity] = []
for items in grouped.values():
clusters = _cluster_aliases_with_cdist(
items=items,
threshold=threshold,
)
for label_type, items in grouped.items():
if label_type in EXACT_LABELS:
exact_groups = {}
for item in items:
exact_groups.setdefault(item["exact_alias"], []).append(item)
clusters = [
[
(
item["text"],
str(item["exact_alias"]).lower().strip(),
item["aymurai_label"],
)
for item in group_items
]
for group_items in exact_groups.values()
]
else:
clusters = _cluster_aliases_with_cdist(
items=items,
threshold=threshold,
)

canonical_entities.extend(_clusters_to_canonical_entities(clusters))

canonical_entities = sorted(canonical_entities, key=lambda x: x.canonical_text)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@
"\n",
"print(f\"Found {len(documents)} documents\")\n",
"\n",
"doc_path = documents[14]\n",
"doc_path = documents[5]\n",
"print(f\"Processing document: {doc_path}\")"
]
},
Expand Down Expand Up @@ -430,6 +430,13 @@
"result = group_alt_texts_by_entity(anonymize_labels_fuzzy)\n",
"print(json.dumps(result, indent=4, ensure_ascii=False))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
Expand Down
52 changes: 52 additions & 0 deletions tests/api/routers/anonymizer/test_anonymizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -545,6 +545,58 @@ def test_should_dedupe_duplicate_labels_when_returning_cached_prediction(
mock_load_pipeline.assert_not_called()


@pytest.mark.integration
@patch("aymurai.api.endpoints.routers.anonymizer.anonymizer.load_pipeline")
def test_should_merge_cached_duplicate_labels_for_same_span_and_label(
    mock_load_pipeline, client, db_session
):
    """
    Cached duplicates of one span/label must come back as a single merged label.

    The cache row holds two predictions for the same DNI span: one without
    ``aymurai_label_subclass`` and one with it. The endpoint is expected to
    return only the enriched label and to never load the pipeline.
    """
    text = (
        "Víctima: María Paula Trucha, DNI 23.456.789, quien se encuentra "
        "conectada con su cámara apagada."
    )

    # Cached prediction #1: the DNI label without the subclass value.
    plain_label = build_label("DNI", "23.456.789").model_dump(mode="json")
    plain_label["start_char"] = 33
    plain_label["end_char"] = 43
    plain_label["attrs"].update(
        aymurai_alt_text="23.456.789",
        aymurai_alt_start_char=33,
        aymurai_alt_end_char=43,
        aymurai_label_instance=2,
        aymurai_disambiguation="fuzzy",
        aymurai_anonymize=True,
        canonical_entity_id="0bba6d15-1b0c-51f0-b2ca-4fdc8a57cb73",
    )

    # Cached prediction #2: identical span and label, plus the subclass value.
    subclass_label = dict(plain_label)
    subclass_label["attrs"] = dict(
        plain_label["attrs"],
        aymurai_label_subclass=["23456789"],
    )

    cached_paragraph = AnonymizationParagraph(
        id=text_to_uuid(text),
        text=text,
        prediction=[plain_label, subclass_label],
    )
    db_session.add(cached_paragraph)
    db_session.commit()

    response = client.post(
        "/anonymizer/predict",
        json={"text": text},
        params={"use_cache": True},
    )

    assert response.status_code == 200
    payload = response.json()
    assert payload["document"] == text
    assert payload["labels"] == [subclass_label]
    mock_load_pipeline.assert_not_called()


@pytest.mark.integration
@patch(
"aymurai.api.endpoints.routers.anonymizer.anonymizer.map_canonical_entities_ner_preds"
Expand Down
Loading