pull resistance mutations for Influenza from flu_resistance repo and generate collections

The [BU-ISCIII/flu_resistance](https://github.com/BU-ISCIII/flu_resistance) repo documents resistance mutations for Influenza subtypes.

### Implement in the `collection-seeder`

We want to add this as another `Source` in the `collection-seeder`, similar to how RSV resistance mutations were added in #1198.

The source should generate resistance mutation collections for Influenza subtypes (H1N1, H3N2, B/Vic, H5N1 as applicable). For each subtype/organism, it should create collections grouped by antiviral (e.g. Oseltamivir, Zanamivir, Baloxavir, etc.), with variants named after the resistance level (e.g. 'Resistance', 'Partial resistance').

In the list of all sources, these should be placed after the RSV resistance mutations to keep IDs predictable on first generation.

### Helper code

Ivan provided two helper files for loading and converting the BU-ISCIII JSON formats.

**`isciii_loader.py`** — loads the various JSON formats into a shared collection structure:

```python
#!/usr/bin/env python3

import json
import re


# Matches one NGS-surveillance token of the form "<GENE>:<mutation>", where
# <mutation> is either a substitution (optional wild-type residue(s), position,
# alternative residue) or a "Del <start>-<end>" deletion range.
rx_mutdec_ngs = re.compile(
    r"(?P<gene>[A-Z]+\d?)\:(?:(?:(?P<wild>[A-Z](?:/[A-Z])*)?(?P<pos>\d+)(?P<alt>[A-Z]))|(?:(?P<del>Del)\D*(?P<dstart>\d+)\D*-\D*(?P<dend>\d+)\D*))"
)

# Tokens the decoders treat as broken: they are skipped with a warning
# instead of being parsed.
broken = {"M2:BM2", "BM2"}


def iscii_ngs_surv_mut_decode(mut):
    """Yield one parsed mutation dict per token in ``mut["mutations"]``.

    Each yielded dict is the ``groupdict()`` of ``rx_mutdec_ngs`` and has the
    keys ``gene``, ``wild``, ``pos``, ``alt``, ``del``, ``dstart`` and
    ``dend``; groups that did not take part in the match are ``None``.

    Tokens listed in ``broken`` are skipped with a warning on stdout.

    Raises:
        ValueError: if a token does not match ``rx_mutdec_ngs``. The message
            names the offending token (it is also printed, as before).
    """
    for ms in mut["mutations"]:
        if ms in broken:
            print(f"WARNING: skipping broken <{ms}>")
            continue
        rxm = rx_mutdec_ngs.match(ms)
        if rxm is None:
            message = f"cannot parse ngs surv mutation <{ms}>"
            print(message)
            # Carry the diagnostic in the exception too, so it survives when
            # stdout is not captured.
            raise ValueError(message)
        yield rxm.groupdict()


# Matches a single mutation token without a gene prefix: either a substitution
# (optional wild-type residue(s), position, alternative residue) or a
# "Del <start>-<end>" range whose separator may be a hyphen, en dash or minus
# sign. Spaces around the token are tolerated.
rx_mutdec_inhibit = re.compile(
    r" *(?:(?:(?P<wild>[A-Z](?:/[A-Z])*)?(?P<pos>\d+)(?P<alt>[A-Z]))|(?:(?P<del>Del)\D*(?P<dstart>\d+)\D*[-\u2013\u2212]\D*(?P<dend>\d+)\D*)) *"
)


def iscii_inhibit_mut_decode(mut):
    """Yield parsed mutation dicts for every ``mutation_<gene>`` key of ``mut``.

    A key is considered when it starts with ``mutation_`` and its value is not
    ``[""]``; the gene name is the second ``_``-separated field of the key.
    Each token of the value list is parsed with ``rx_mutdec_inhibit`` and
    yielded as that match's ``groupdict()`` plus a ``gene`` entry.

    A warning is printed when no gene qualifies.

    Raises:
        ValueError: if a token does not match ``rx_mutdec_inhibit``.
    """
    prefix = "mutation_"
    gene_names = []
    for key, value in mut.items():
        if key.startswith(prefix) and value != [""]:
            gene_names.append(key.split("_")[1])

    if not gene_names:
        print(f"Warning: no mutations found for:\n{mut}")

    for gene in gene_names:
        for token in mut[prefix + gene]:
            found = rx_mutdec_inhibit.match(token)
            if found is None:
                print(f"cannot parse inhibit mutation <{token}>")
                print(mut)
                raise ValueError
            yield {**found.groupdict(), "gene": gene}


def iscii_inhibit_resist_decode(mut):
    """Yield one resistance record per ``<antiviral>_resistance_level`` key.

    Keys whose value is ``"?"`` or ``"?e"`` are ignored. Each remaining value
    has Unicode minus (U+2212) and en dash (U+2013) replaced by ``-`` and is
    then split at the first space into a level and a fold change. A value
    without a space that starts with a digit is taken as a fold change with
    an empty level.

    Yields:
        dict with the keys ``antiviral_resistance``, ``resistance_level`` and
        ``fold_change``.

    Raises:
        ValueError: if a value has no space and does not start with a digit
            (this includes the empty string).
    """
    suffix = "_resistance_level"
    # Strip the suffix rather than taking the first "_" field, so that the key
    # rebuilt below is always the key that was found, even when the antiviral
    # name itself contains an underscore.
    targets = [
        k[: -len(suffix)]
        for k in mut
        if k.endswith(suffix) and mut[k] not in {"?", "?e"}
    ]
    if not targets:
        print(f"Warning: no resistance levels found for:\n{mut}")

    for k in targets:
        raw = mut[f"{k}{suffix}"]
        lvl_txt = raw.replace("\u2212", "-").replace("\u2013", "-")
        # Split the normalised text, so the fold change never keeps a
        # typographic dash regardless of which branch produced it.
        lvl = lvl_txt.split(" ", 1)
        if len(lvl) < 2:
            # [:1] instead of [0]: an empty value must end up in the
            # "cannot parse" branch, not raise IndexError.
            if lvl_txt[:1].isdigit():
                lvl = ["", lvl_txt]
            else:
                message = f"Cannot parse {k} : {raw}"
                print(message)
                print(mut)
                raise ValueError(message)

        yield {
            "antiviral_resistance": k,
            "resistance_level": lvl[0],
            "fold_change": lvl[1],
        }


def iscii_mut_data_mut_decode(mut):
    """Yield parsed mutation dicts for the ``mutation_<gene>`` keys of ``mut``.

    A key is considered when it starts with ``mutation_`` and its value is not
    the empty string; the gene name is the second ``_``-separated field of the
    key, and the name ``origin`` is excluded. Each value is a string from
    which spaces and newlines are removed before it is split on ``+``; every
    resulting token is parsed with ``rx_mutdec_inhibit`` and yielded as that
    match's ``groupdict()`` plus a ``gene`` entry.

    Genes are processed in the order their keys appear in ``mut``, so the
    output order is the same on every run.

    A warning is printed when no gene qualifies.

    Raises:
        ValueError: if a token does not match ``rx_mutdec_inhibit``. The
            message names the offending token (it is also printed, as before).
    """
    prefix = "mutation_"
    # An ordered, de-duplicated list instead of a set: iterating a set of
    # strings depends on per-process hash randomisation.
    genes = []
    for k in mut:
        if not k.startswith(prefix) or mut[k] == "":
            continue
        g = k.split("_")[1]
        if g != "origin" and g not in genes:
            genes.append(g)

    if not genes:
        print(f"Warning: no mutations found for:\n{mut}")

    for g in genes:
        mstr = mut[f"{prefix}{g}"].replace(" ", "").replace("\n", "")
        for ms in mstr.split("+"):
            if ms in broken:
                print(f"WARNING: skipping broken <{ms}>")
                # NOTE(review): this abandons the remaining tokens of this
                # gene as well, unlike the NGS decoder which only skips the
                # broken token — confirm this is intended.
                break
            rxm = rx_mutdec_inhibit.match(ms)
            if rxm is None:
                message = f"cannot parse data_mut mutation <{ms}>"
                print(message)
                print(mut)
                raise ValueError(message)
            mg = rxm.groupdict()
            mg["gene"] = g
            yield mg


# Record fields copied verbatim into the resistance entry.
keep_resist = [
    "antiviral_resistance",
    "resistance_level",
    "min_fold_change",
    "max_fold_change",
]


def iscii_mut_data_resist_decode(mut):
    """Yield a single dict holding the ``keep_resist`` fields of ``mut``.

    Raises:
        KeyError: if ``mut`` lacks any of the ``keep_resist`` fields.
    """
    record = {}
    for field in keep_resist:
        record[field] = mut[field]
    yield record


# Translates the lineage suffix that follows "A(H3N2)" in a strain label.
map_h3n2_sub = {
    "": "seasonal",
    "v": "variant",
}


def iscii_parser(collection, mg, mut, mut_decode, resist_decode):
    """Decode one record and return its collection entry.

    Args:
        collection: mapping of already parsed entries, keyed by
            ``(type, subtype, lineage, gene, mutation string)``.
        mg: dict with the keys ``type``, ``sub`` and ``lin`` describing the
            strain.
        mut: the raw record, passed unchanged to both decoders.
        mut_decode: callable returning an iterable of mutation dicts with the
            keys ``gene``, ``wild``, ``pos``, ``alt``, ``del``, ``dstart``
            and ``dend``.
        resist_decode: callable returning an iterable of resistance records.

    Returns:
        ``{key: entry}`` where ``entry`` has the keys ``mut``, ``sort_key``
        and ``resist``, or ``None`` when the decoded mutations do not all
        belong to exactly one gene (this includes the case where nothing was
        decoded). If ``key`` already exists in ``collection`` the existing
        entry is extended in place and returned.

    Raises:
        ValueError: if ``mg["type"]`` is neither ``"A"`` nor ``"B"``.
        KeyError: if the subtype is H3N2 and ``mg["lin"]`` is not a key of
            ``map_h3n2_sub``.
    """
    mdec = list(mut_decode(mut))

    uniq_genes = {m["gene"] for m in mdec}
    if len(uniq_genes) != 1:
        print(
            "WARNING: Currently we DO NOT support mutation combinations spread across mutliple segments!\n"
            f"<{uniq_genes}>"
        )
        return None
    [gene] = uniq_genes

    # The wild-type group of the mutation regexes is optional, so m["wild"]
    # may be None; fall back to an empty prefix instead of slicing None.
    # When several wild-type residues are listed ("H/Y") only the first is kept.
    mstr = "+".join(
        (
            f"{m['del']} {m['dstart']}-{m['dend']}"
            if "Del" == m["del"]
            else f"{(m['wild'] or '')[:1]}{m['pos']}{m['alt']}"
        )
        for m in mdec
    )

    if "A" == mg["type"]:
        lineage = map_h3n2_sub[mg["lin"]] if mg["sub"] == "H3N2" else mg["lin"]
    elif "B" == mg["type"]:
        lineage = mg["lin"]
    else:
        message = f"Bad type <{mg['type']}>"
        print(message)
        raise ValueError(message)
    tup = (mg["type"], mg["sub"], lineage, gene, mstr)

    if tup in collection:
        ret = collection[tup]
    else:
        # Orders entries by number of mutations first, then by position and
        # alternative residue (or deletion start) of each mutation.
        sort_key = [len(mdec)]
        sort_key += [
            f"{m['dstart']}-" if "Del" == m["del"] else f"{m['pos']}{m['alt']}"
            for m in mdec
        ]
        ret = {
            "mut": mdec,
            "sort_key": tuple(sort_key),
            "resist": [],
        }

    ret["resist"].extend(resist_decode(mut))
    return {tup: ret}


# Maps each input-format name accepted as load_isciii's `sub_r` argument to a
# (mutation decoder, resistance decoder) pair.
sub_readers = {
    "ngs_surv": (iscii_ngs_surv_mut_decode, iscii_mut_data_resist_decode),
    "inhibit": (iscii_inhibit_mut_decode, iscii_inhibit_resist_decode),
    "mut_data": (iscii_mut_data_mut_decode, iscii_mut_data_resist_decode),
}

# Parses a strain label such as "A(H3N2)v" or "Type B" into the influenza
# type, an optional subtype and an optional lineage suffix after the subtype.
rx_typing = re.compile(
    r"(?:Type )?(?P<type>[AB])(?:\((?P<sub>H\d+N\d+)\)(?P<lin>\w*))?"
)
# Matches a free-text "Influenza subtypes mentioned in articles: ..." label
# and captures the list after the colon. Not used by the active code path of
# load_isciii.
rx_sentence = re.compile(
    r"Influenza\W+subtypes\W+mentioned\W+in\W+articles:(?P<list>.*)"
)


def load_isciii(data, sub_r="ngs_surv"):
    """Build a mutation collection from an iterable of BU-ISCIII records.

    Args:
        data: iterable of record dicts; each must have a ``strain`` entry.
        sub_r: key into ``sub_readers`` selecting the decoder pair that
            matches the record format.

    Returns:
        dict mapping ``(type, subtype, lineage, gene, mutation string)`` to
        the entry built by ``iscii_parser``.

    Records whose ``strain`` does not match ``rx_typing`` are skipped with a
    warning, as are records for which ``iscii_parser`` returns ``None``.
    """
    collection = {}

    for record in data:
        typed = rx_typing.match(record["strain"])
        if typed is None:
            # Labels that are not a plain type/subtype, e.g. the free-text
            # sentence matched by rx_sentence, are not parsed yet.
            print(f"WARNING: skipping broken <{record['strain']}>")
            continue

        mut_decode, resist_decode = sub_readers[sub_r]
        parsed = iscii_parser(
            collection, typed.groupdict(), record, mut_decode, resist_decode
        )
        if parsed is not None:
            collection.update(parsed)

    return collection
```

**`isciii_converter.py`** — takes a collection from the loader and returns a DataFrame in roughly the same format as the RSV-A/B output:

```python
import pandas as pd


def _fold_change_text(r):
    """Return the fold-change fragment of the comment for one resistance record."""
    if "min_fold_change" in r:
        if "" == r.get("min_fold_change", ""):
            bounds = f"(<{r['max_fold_change']})"
        elif "" == r.get("max_fold_change", ""):
            bounds = f"(>{r['min_fold_change']})"
        else:
            bounds = f"({r['min_fold_change']}-{r['max_fold_change']})"
    else:
        bounds = ""
    # An explicit "fold_change" entry wins over the min/max bounds.
    return r.get("fold_change", bounds)


def _resistance_phrase(r):
    """Return the comment fragment describing one resistance record."""
    phrase = f"resistance against {r['antiviral_resistance']} "
    if r.get("resistance_level", None):
        phrase += f"level {r['resistance_level']} "
    return phrase + _fold_change_text(r)


def flurify_isciii(collection, protein, skip_resist=False):
    """Convert the collection entries of one protein into a DataFrame.

    Args:
        collection: mapping keyed by
            ``(type, subtype, lineage, gene, mutation string)`` whose values
            have the keys ``sort_key``, ``resist`` and ``mut``.
        protein: only entries whose key has this gene are kept.
        skip_resist: when true, ``Comment`` is empty and the ``resist_list``
            and ``mut_list`` columns are omitted.

    Returns:
        A DataFrame with the columns ``Type``, ``Subtype``, ``Lineage``,
        ``AA`` and ``Comment`` (plus ``resist_list`` and ``mut_list`` unless
        ``skip_resist`` is set), one row per kept entry, ordered by type,
        subtype, lineage and the entry's ``sort_key``.
    """
    rows = {}
    for key, entry in collection.items():
        if protein != key[3]:
            continue

        subtype = key[1] or ""
        row_key = (key[0], subtype, key[2] or "") + entry["sort_key"]

        if skip_resist:
            comment = ""
        else:
            phrases = [_resistance_phrase(r) for r in entry["resist"]]
            comment = "; ".join(phrases) + "."

        row = {
            "Type": key[0],
            "Subtype": subtype,
            # Type B rows without a lineage get a single space, not "".
            "Lineage": key[2] or (" " if "B" == key[0] else ""),
            "AA": key[4],
            "Comment": comment,
        }
        if not skip_resist:
            row["resist_list"] = entry["resist"]
            row["mut_list"] = entry["mut"]

        rows[row_key] = row

    ordered = [rows[row_key] for row_key in sorted(rows)]
    return pd.DataFrame(data=ordered)
```

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

pull resistance mutations for Influenza from flu_resistance repo and generate collections #1272

Implement in the `collection-seeder`

Helper code

Metadata

Assignees

Labels

Type

Fields

Projects

Milestone

Relationships

Development

Uh oh!

pull resistance mutations for Influenza from flu_resistance repo and generate collections #1272

Description

Implement in the collection-seeder

Helper code

Metadata

Metadata

Assignees

Labels

Type

Fields

Projects

Milestone

Relationships

Development

Issue actions

Implement in the `collection-seeder`