Skip to content

pull resistance mutations for Influenza from flu_resistance repo and generate collections #1272

Description

@fhennig

The BU-ISCIII/flu_resistance repo documents resistance mutations for Influenza subtypes.

Implement in the collection-seeder

We want to add this as another Source in the collection-seeder, similar to how RSV resistance mutations were added in #1198.

The source should generate resistance mutation collections for Influenza subtypes (H1N1, H3N2, B/Vic, H5N1 as applicable). For each subtype/organism, it should create collections grouped by antiviral (e.g. Oseltamivir, Zanamivir, Baloxavir, etc.), with variants named after the resistance level (e.g. 'Resistance', 'Partial resistance').

In the list of all sources, these should be placed after the RSV resistance mutations to keep IDs predictable on first generation.

Helper code

Ivan provided two helper files for loading and converting the BU-ISCIII JSON formats.

isciii_loader.py — loads the various JSON formats into a shared collection structure:

#!/usr/bin/env python3

import json
import re


# Pattern for one "GENE:mutation" token.  After the gene name and a colon the
# token is either a substitution (optional wild-type residue(s), position,
# alternative residue) or a deletion written as "Del" plus a start-end range.
rx_mutdec_ngs = re.compile(
    r"(?P<gene>[A-Z]+\d?)\:(?:(?:(?P<wild>[A-Z](?:/[A-Z])*)?(?P<pos>\d+)(?P<alt>[A-Z]))|(?:(?P<del>Del)\D*(?P<dstart>\d+)\D*-\D*(?P<dend>\d+)\D*))"
)

# Tokens that are skipped with a warning instead of being parsed.
broken = {"M2:BM2", "BM2"}


def iscii_ngs_surv_mut_decode(mut):
    """Decode the mutation tokens of one "ngs_surv" record.

    Args:
        mut: Mapping whose ``"mutations"`` entry is an iterable of
            ``"GENE:mutation"`` strings.

    Yields:
        One dict per parsed token with the named groups of ``rx_mutdec_ngs``
        (``gene``, ``wild``, ``pos``, ``alt``, ``del``, ``dstart``, ``dend``);
        groups that did not take part in the match are ``None``.

    Raises:
        ValueError: If a token is neither listed in ``broken`` nor matched by
            ``rx_mutdec_ngs``.
    """
    for token in mut["mutations"]:
        if token in broken:
            # Known-bad entries are reported and dropped, not treated as errors.
            print(f"WARNING: skipping broken <{token}>")
        else:
            parsed = rx_mutdec_ngs.match(token)
            if parsed is None:
                print(f"cannot parse ngs surv mutation <{token}>")
                raise ValueError
            yield parsed.groupdict()


# Pattern for one mutation token without a gene prefix.  Surrounding spaces
# are tolerated; the deletion range may use "-", an en dash or a minus sign.
rx_mutdec_inhibit = re.compile(
    r" *(?:(?:(?P<wild>[A-Z](?:/[A-Z])*)?(?P<pos>\d+)(?P<alt>[A-Z]))|(?:(?P<del>Del)\D*(?P<dstart>\d+)\D*[-\u2013\u2212]\D*(?P<dend>\d+)\D*)) *"
)


def iscii_inhibit_mut_decode(mut):
    """Decode the per-gene mutation lists of one "inhibit" record.

    Every key of the form ``mutation_<gene>`` whose value is not ``[""]`` is
    treated as a list of mutation tokens for ``<gene>``.

    Args:
        mut: Mapping holding the ``mutation_<gene>`` lists.

    Yields:
        One dict per token with the named groups of ``rx_mutdec_inhibit``
        plus a ``"gene"`` entry naming the gene the token was listed under.

    Raises:
        ValueError: If a token is not matched by ``rx_mutdec_inhibit``.
    """
    prefix = "mutation_"
    gene_names = []
    for key in mut:
        if key.startswith(prefix) and mut[key] != [""]:
            gene_names.append(key.split("_")[1])

    if not gene_names:
        print(f"Warning: no mutations found for:\n{mut}")

    for gene in gene_names:
        for token in mut[f"mutation_{gene}"]:
            hit = rx_mutdec_inhibit.match(token)
            if hit is None:
                print(f"cannot parse inhibit mutation <{token}>")
                print(mut)
                raise ValueError
            fields = hit.groupdict()
            fields["gene"] = gene
            yield fields


def iscii_inhibit_resist_decode(mut):
    """Decode the per-antiviral resistance levels of one "inhibit" record.

    Every key of the form ``<antiviral>_resistance_level`` whose value is not
    ``"?"`` or ``"?e"`` is decoded.  The value is expected to look like
    ``"<level> <fold change>"``; a value without a space that starts with a
    digit is treated as a bare fold change with an empty level.  Unicode
    minus signs and en dashes are normalised to ``"-"`` before splitting, so
    the yielded fold change uses the same dash in both forms.

    Args:
        mut: Mapping holding the ``<antiviral>_resistance_level`` strings.

    Yields:
        Dicts with the keys ``antiviral_resistance``, ``resistance_level``
        and ``fold_change``.

    Raises:
        ValueError: If a value has no space and does not start with a digit
            (this includes the empty string).
    """
    suffix = "_resistance_level"
    # Strip only the suffix so antiviral names that contain an underscore are
    # kept whole and map back to the key they came from.
    targets = [
        k[: -len(suffix)]
        for k in mut
        if k.endswith(suffix) and mut[k] not in {"?", "?e"}
    ]
    if not targets:
        print(f"Warning: no resistance levels found for:\n{mut}")

    for k in targets:
        raw = mut[f"{k}{suffix}"]
        lvl_txt = raw.replace("\u2212", "-").replace("\u2013", "-")
        # Split the normalised text, not the raw value, so the fold change
        # carries plain ASCII dashes whichever form the value has.
        lvl = lvl_txt.split(" ", 1)
        if len(lvl) < 2:
            # Slicing keeps an empty value on the ValueError path below.
            if lvl_txt[:1].isdigit():
                lvl = ["", lvl_txt]
            else:
                print(f"Cannot parse {k} :", raw)
                print(mut)
                raise ValueError(f"cannot parse resistance level for {k}: {raw!r}")

        yield {
            "antiviral_resistance": k,
            "resistance_level": lvl[0],
            "fold_change": lvl[1],
        }


def iscii_mut_data_mut_decode(mut):
    """Decode the per-gene mutation strings of one "mut_data" record.

    Every key of the form ``mutation_<gene>`` with a non-empty string value is
    decoded, except ``mutation_origin``.  Spaces and newlines are removed from
    the value, which is then split on ``"+"`` into individual tokens.

    Args:
        mut: Mapping holding the ``mutation_<gene>`` strings.

    Yields:
        One dict per token with the named groups of ``rx_mutdec_inhibit``
        plus a ``"gene"`` entry naming the gene the token was listed under.

    Raises:
        ValueError: If a token is not matched by ``rx_mutdec_inhibit``.
    """
    prefix = "mutation_"
    gene_names = {
        key.split("_")[1]
        for key in mut
        if key.startswith(prefix) and mut[key] != ""
    }
    # "mutation_origin" is metadata, not a gene.
    gene_names -= {"origin"}

    if not gene_names:
        print(f"Warning: no mutations found for:\n{mut}")

    for gene in gene_names:
        compact = mut[f"mutation_{gene}"].replace(" ", "").replace("\n", "")
        for token in compact.split("+"):
            if token in broken:
                # A known-bad token ends processing of this gene's remaining
                # tokens; tokens already yielded for the gene are kept.
                print(f"WARNING: skipping broken <{token}>")
                break
            hit = rx_mutdec_inhibit.match(token)
            if hit is None:
                print(f"cannot parse data_mut mutation <{token}>")
                print(mut)
                raise ValueError
            fields = hit.groupdict()
            fields["gene"] = gene
            yield fields


# Fields copied verbatim from a record into its resistance entry.
keep_resist = [
    "antiviral_resistance",
    "resistance_level",
    "min_fold_change",
    "max_fold_change",
]


def iscii_mut_data_resist_decode(mut):
    """Extract the resistance fields of one record.

    Args:
        mut: Mapping that contains every key listed in ``keep_resist``.

    Yields:
        A single dict holding just the ``keep_resist`` fields of ``mut``.

    Raises:
        KeyError: If one of the ``keep_resist`` keys is missing from ``mut``.
    """
    entry = {}
    for field in keep_resist:
        entry[field] = mut[field]
    yield entry


# Maps the lineage suffix parsed after an "A(H3N2)" strain label to a
# descriptive lineage name.
map_h3n2_sub = {
    "": "seasonal",
    "v": "variant",
}


def _iscii_mutation_label(m):
    """Render one decoded mutation as text, e.g. ``H275Y`` or ``Del 245-248``."""
    if "Del" == m["del"]:
        return f"{m['del']} {m['dstart']}-{m['dend']}"
    # The wild-type residue is optional in the decoder patterns, so the group
    # may be None; only the first residue of an "A/B" alternative list is kept.
    wild = (m["wild"] or "")[:1]
    return f"{wild}{m['pos']}{m['alt']}"


def iscii_parser(collection, mg, mut, mut_decode, resist_decode):
    """Turn one raw record into a collection entry.

    Args:
        collection: Dict of entries built so far, keyed by
            ``(type, subtype, lineage, gene, mutation string)``.  It is only
            read here, but an entry that already exists for the record's key
            has its ``"resist"`` list extended in place.
        mg: Dict with the keys ``"type"``, ``"sub"`` and ``"lin"`` describing
            the strain.
        mut: The raw record, passed unchanged to both decoders.
        mut_decode: Callable returning an iterable of decoded mutation dicts
            (keys ``gene``, ``wild``, ``pos``, ``alt``, ``del``, ``dstart``,
            ``dend``).
        resist_decode: Callable returning an iterable of resistance dicts.

    Returns:
        A one-item dict ``{key: entry}`` where ``entry`` has the keys
        ``"mut"``, ``"sort_key"`` and ``"resist"``; or ``None`` when the
        decoded mutations do not all belong to exactly one gene.

    Raises:
        ValueError: If ``mg["type"]`` is neither ``"A"`` nor ``"B"``.
        KeyError: If the type is ``"A"``, the subtype is ``"H3N2"`` and the
            lineage suffix is not listed in ``map_h3n2_sub``.
    """
    mdec = list(mut_decode(mut))

    uniq_genes = {m["gene"] for m in mdec}
    if len(uniq_genes) != 1:
        print(
            "WARNING: Currently we DO NOT support mutation combinations spread across mutliple segments!\n"
            f"<{uniq_genes}>"
        )
        return None
    [gene] = uniq_genes

    mstr = "+".join(_iscii_mutation_label(m) for m in mdec)

    if "A" == mg["type"]:
        # Only H3N2 lineage suffixes are translated; others are kept verbatim.
        lineage = map_h3n2_sub[mg["lin"]] if mg["sub"] == "H3N2" else mg["lin"]
    elif "B" == mg["type"]:
        lineage = mg["lin"]
    else:
        print(f"Bad type <{mg['type']}>")
        raise ValueError(f"Bad type <{mg['type']}>")
    tup = (mg["type"], mg["sub"], lineage, gene, mstr)

    if tup in collection:
        ret = collection[tup]
    else:
        # Sort by number of mutations first, then by position/alternative
        # (or by deletion start) of each mutation in order.
        sort_key = [len(mdec)]
        sort_key += [
            f"{m['dstart']}-" if "Del" == m["del"] else f"{m['pos']}{m['alt']}"
            for m in mdec
        ]
        ret = {
            "mut": mdec,
            "sort_key": tuple(sort_key),
            "resist": [],
        }

    ret["resist"].extend(resist_decode(mut))
    return {tup: ret}


# Decoder pairs selectable by name: (mutation decoder, resistance decoder).
sub_readers = {
    "ngs_surv": (iscii_ngs_surv_mut_decode, iscii_mut_data_resist_decode),
    "inhibit": (iscii_inhibit_mut_decode, iscii_inhibit_resist_decode),
    "mut_data": (iscii_mut_data_mut_decode, iscii_mut_data_resist_decode),
}

# Parses a strain label: an optional "Type " prefix, the type letter A or B,
# and optionally a "(HxNy)" subtype followed by a lineage suffix made of word
# characters (possibly empty).  "sub" and "lin" are None without a subtype.
rx_typing = re.compile(
    r"(?:Type )?(?P<type>[AB])(?:\((?P<sub>H\d+N\d+)\)(?P<lin>\w*))?"
)
# Matches the words "Influenza subtypes mentioned in articles:" (separated by
# non-word characters) and captures the rest of the line as "list".
rx_sentence = re.compile(
    r"Influenza\W+subtypes\W+mentioned\W+in\W+articles:(?P<list>.*)"
)


def load_isciii(data, sub_r="ngs_surv"):
    """Build a collection from a list of raw records.

    Args:
        data: Iterable of record dicts; each must have a ``"strain"`` entry.
        sub_r: Name of the decoder pair in ``sub_readers`` to use
            (``"ngs_surv"``, ``"inhibit"`` or ``"mut_data"``).

    Returns:
        Dict mapping ``(type, subtype, lineage, gene, mutation string)`` to
        the entry produced by ``iscii_parser``.  Records whose strain label
        is not matched by ``rx_typing``, and records for which
        ``iscii_parser`` returns ``None``, are left out.
    """
    collection = {}

    for record in data:
        typed = rx_typing.match(record["strain"])
        if typed is None:
            # Strain labels that rx_typing cannot parse (for example
            # free-text sentences) are not handled; report and move on.
            print(f"WARNING: skipping broken <{record['strain']}>")
            continue

        strain_info = typed.groupdict()
        mut_decode, resist_decode = sub_readers[sub_r]
        parsed = iscii_parser(
            collection, strain_info, record, mut_decode, resist_decode
        )
        if parsed is not None:
            collection.update(parsed)

    return collection

isciii_converter.py — takes a collection from the loader and returns a DataFrame in roughly the same format as the RSV-A/B output:

import pandas as pd


def flurify_isciii(collection, protein, skip_resist=False):
    """Convert the entries of one protein into a DataFrame.

    Args:
        collection: Dict keyed by ``(type, subtype, lineage, gene, mutation
            string)`` whose values have the keys ``"sort_key"`` (a tuple),
            ``"resist"`` and ``"mut"``.
        protein: Only entries whose key has this gene (4th element) are kept.
        skip_resist: When true, the ``Comment`` column is left empty and the
            ``resist_list`` / ``mut_list`` columns are omitted.

    Returns:
        A ``pandas.DataFrame`` with the columns ``Type``, ``Subtype``,
        ``Lineage``, ``AA`` and ``Comment`` (plus ``resist_list`` and
        ``mut_list`` unless ``skip_resist``), with rows ordered by type,
        subtype, lineage and then the entry's sort key.
    """

    def fold_change_text(r):
        # The fallback is computed before looking up "fold_change", exactly
        # as a default argument to dict.get would be.
        if "min_fold_change" in r:
            if "" == r.get("min_fold_change", ""):
                fallback = f"(<{r['max_fold_change']})"
            elif "" == r.get("max_fold_change", ""):
                fallback = f"(>{r['min_fold_change']})"
            else:
                fallback = f"({r['min_fold_change']}-{r['max_fold_change']})"
        else:
            fallback = ""
        return r.get("fold_change", fallback)

    def describe(r):
        text = f"resistance against {r['antiviral_resistance']} "
        if r.get("resistance_level", None):
            text += f"level {r['resistance_level']} "
        return text + fold_change_text(r)

    rows = {}
    for key, entry in collection.items():
        if protein != key[3]:
            continue

        virus_type = key[0]
        subtype = key[1] or ""
        if skip_resist:
            comment = ""
        else:
            comment = "; ".join([describe(r) for r in entry["resist"]]) + "."

        row = {
            "Type": virus_type,
            "Subtype": subtype,
            # Type B entries without a lineage get a single space.
            "Lineage": key[2] or (" " if "B" == virus_type else ""),
            "AA": key[4],
            "Comment": comment,
        }
        if not skip_resist:
            row["resist_list"] = entry["resist"]
            row["mut_list"] = entry["mut"]

        rows[(virus_type, subtype, key[2] or "") + entry["sort_key"]] = row

    ordered = [rows[row_key] for row_key in sorted(rows.keys())]
    return pd.DataFrame(data=ordered)

Metadata

Metadata

Assignees

No one assigned

    Labels

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions