The BU-ISCIII/flu_resistance repo documents resistance mutations for Influenza subtypes.
Implement in the collection-seeder
We want to add this as another Source in the collection-seeder, similar to how RSV resistance mutations were added in #1198.
The source should generate resistance mutation collections for Influenza subtypes (H1N1, H3N2, B/Vic, H5N1 as applicable). For each subtype/organism, it should create collections grouped by antiviral (e.g. Oseltamivir, Zanamivir, Baloxavir, etc.), with variants named after the resistance level (e.g. 'Resistance', 'Partial resistance').
In the list of all sources, these should be placed after the RSV resistance mutations to keep IDs predictable on first generation.
Helper code
Two helper files from Ivan for loading and converting the BU-ISCIII JSON formats.
isciii_loader.py — loads the various JSON formats into a shared collection structure:
#!/usr/bin/env python3
import json
import re
# Pattern for NGS-surveillance mutation strings of the form "<gene>:<mutation>".
# After the gene name and colon it accepts one of two alternatives:
#   * a substitution: optional wild-type residue(s) (several may be listed,
#     separated by "/"), a numeric position and one alternate residue,
#     e.g. "NA:H275Y";
#   * a deletion: the literal "Del" followed by a start and an end position
#     separated by an ASCII hyphen, e.g. "NA:Del 245-248".
# Named groups: gene, wild, pos, alt, del, dstart, dend (groups belonging to
# the alternative that did not match are None).
rx_mutdec_ngs = re.compile(
    r"(?P<gene>[A-Z]+\d?)\:(?:(?:(?P<wild>[A-Z](?:/[A-Z])*)?(?P<pos>\d+)(?P<alt>[A-Z]))|(?:(?P<del>Del)\D*(?P<dstart>\d+)\D*-\D*(?P<dend>\d+)\D*))"
)
# Mutation strings that the decoders below skip (with a warning) instead of
# trying to parse them.
broken = {"M2:BM2", "BM2"}
def iscii_ngs_surv_mut_decode(mut):
    """Yield one parsed mutation per entry of ``mut["mutations"]``.

    Each yielded value is the ``groupdict()`` of ``rx_mutdec_ngs`` for that
    entry. Entries listed in ``broken`` are reported and skipped.

    Raises:
        ValueError: if an entry is not in ``broken`` and does not match
            ``rx_mutdec_ngs``.
    """
    for raw in mut["mutations"]:
        if raw in broken:
            print(f"WARNING: skipping broken <{raw}>")
            continue
        parsed = rx_mutdec_ngs.match(raw)
        if parsed is not None:
            yield parsed.groupdict()
            continue
        # Unparseable input is fatal: report it, then abort the generator.
        print(f"cannot parse ngs surv mutation <{raw}>")
        raise ValueError
# Pattern for a single mutation without a gene prefix, optionally surrounded
# by spaces. It accepts either a substitution (optional wild-type residue(s),
# position, alternate residue) or a "Del <start>-<end>" deletion whose range
# separator may be an ASCII hyphen, an en dash (U+2013) or a minus sign
# (U+2212).
# Named groups: wild, pos, alt, del, dstart, dend.
rx_mutdec_inhibit = re.compile(
    r" *(?:(?:(?P<wild>[A-Z](?:/[A-Z])*)?(?P<pos>\d+)(?P<alt>[A-Z]))|(?:(?P<del>Del)\D*(?P<dstart>\d+)\D*[-\u2013\u2212]\D*(?P<dend>\d+)\D*)) *"
)
def iscii_inhibit_mut_decode(mut):
    """Yield parsed mutations from every ``mutation_<gene>`` list in *mut*.

    Keys starting with ``mutation_`` whose value is not ``[""]`` are used;
    the gene name is the second ``_``-separated field of the key. Every
    string in such a list is matched against ``rx_mutdec_inhibit`` and its
    ``groupdict()`` is yielded with an extra ``"gene"`` entry.

    A warning is printed when no usable key is found.

    Raises:
        ValueError: if a mutation string does not match the pattern.
    """
    prefix = "mutation_"
    gene_names = []
    for key in mut.keys():
        if key.startswith(prefix) and mut[key] != [""]:
            gene_names.append(key.split("_")[1])

    if not gene_names:
        print(f"Warning: no mutations found for:\n{mut}")

    for gene in gene_names:
        for raw in mut[f"mutation_{gene}"]:
            hit = rx_mutdec_inhibit.match(raw)
            if hit is None:
                print(f"cannot parse inhibit mutation <{raw}>")
                print(mut)
                raise ValueError
            fields = hit.groupdict()
            fields["gene"] = gene
            yield fields
def iscii_inhibit_resist_decode(mut):
    """Yield one resistance record per ``<antiviral>_resistance_level`` key.

    Keys whose value is ``"?"`` or ``"?e"`` are ignored. The remaining
    values are expected to look like ``"<level> <fold change>"`` or, when
    they start with a digit, like a bare fold change. Unicode minus
    (U+2212) and en dash (U+2013) are normalised to an ASCII hyphen before
    the value is split, so the yielded ``fold_change`` never carries them.

    Args:
        mut: mapping for one record of the "inhibit" JSON format.

    Yields:
        dict with the keys ``antiviral_resistance`` (the key with the
        ``_resistance_level`` suffix removed), ``resistance_level``
        (``""`` when only a fold change is given) and ``fold_change``.

    Raises:
        ValueError: if a value has a single token that does not start with
            a digit (this includes the empty string).
    """
    suffix = "_resistance_level"
    # Strip the suffix instead of splitting on "_" so that antiviral names
    # which themselves contain an underscore map back to their own key.
    targets = [
        key[: -len(suffix)]
        for key in mut
        if key.endswith(suffix) and mut[key] not in {"?", "?e"}
    ]
    if not targets:
        print(f"Warning: no resistance levels found for:\n{mut}")
    for target in targets:
        raw = mut[f"{target}{suffix}"]
        lvl_txt = raw.replace("\u2212", "-").replace("\u2013", "-")
        # Split the normalised text, not the raw one, so both output fields
        # see the same dash characters.
        lvl = lvl_txt.split(" ", 1)
        if len(lvl) < 2:
            # A lone token is only acceptable when it is a fold change;
            # slicing keeps the empty string from raising IndexError.
            if not lvl_txt[:1].isdigit():
                print(f"Cannot parse {target} :", raw)
                print(mut)
                raise ValueError(
                    f"cannot parse resistance level for {target}: {raw!r}"
                )
            lvl = ["", lvl_txt]
        yield {
            "antiviral_resistance": target,
            "resistance_level": lvl[0],
            "fold_change": lvl[1],
        }
def iscii_mut_data_mut_decode(mut):
    """Yield parsed mutations from the ``mutation_<gene>`` strings in *mut*.

    Every key starting with ``mutation_`` and holding a non-empty string
    contributes a gene name (second ``_``-separated field); the name
    ``"origin"`` is dropped. Each gene's string has spaces and newlines
    removed and is split on ``"+"``; every piece is matched against
    ``rx_mutdec_inhibit`` and its ``groupdict()`` is yielded with an extra
    ``"gene"`` entry.

    A piece listed in ``broken`` stops processing of the remaining pieces
    for that gene. A warning is printed when no gene is found.

    Raises:
        ValueError: if a piece does not match the pattern.
    """
    prefix = "mutation_"
    genes = {
        key.split("_")[1]
        for key in mut.keys()
        if key.startswith(prefix) and mut[key] != ""
    }
    genes.discard("origin")

    if not genes:
        print(f"Warning: no mutations found for:\n{mut}")

    for gene in genes:
        compact = mut[f"mutation_{gene}"].replace(" ", "").replace("\n", "")
        for piece in compact.split("+"):
            if piece in broken:
                print(f"WARNING: skipping broken <{piece}>")
                # Abandons the rest of this gene's combination, not just the piece.
                break
            hit = rx_mutdec_inhibit.match(piece)
            if hit is None:
                print(f"cannot parse data_mut mutation <{piece}>")
                print(mut)
                raise ValueError
            fields = hit.groupdict()
            fields["gene"] = gene
            yield fields
# Fields copied verbatim from a record into its resistance entry.
keep_resist = [
    "antiviral_resistance",
    "resistance_level",
    "min_fold_change",
    "max_fold_change",
]


def iscii_mut_data_resist_decode(mut):
    """Yield a single dict holding the ``keep_resist`` fields of *mut*.

    Raises:
        KeyError: if *mut* lacks one of the ``keep_resist`` fields.
    """
    record = {}
    for field in keep_resist:
        record[field] = mut[field]
    yield record
# Translates the lineage suffix found after an "A(H3N2)" strain label into the
# lineage name stored in the collection key: no suffix -> "seasonal",
# "v" -> "variant". Only consulted for subtype H3N2.
map_h3n2_sub = {
    "": "seasonal",
    "v": "variant",
}
def iscii_parser(collection, mg, mut, mut_decode, resist_decode):
    """Turn one source record into a ``{key: entry}`` pair for the collection.

    Args:
        collection: mapping of already parsed entries. It is only read
            here, but an entry found in it is extended in place.
        mg: dict with the keys ``"type"``, ``"sub"`` and ``"lin"``
            describing the strain.
        mut: the raw record; passed unchanged to both decoders.
        mut_decode: callable returning an iterable of mutation dicts with
            the keys ``gene``, ``wild``, ``pos``, ``alt``, ``del``,
            ``dstart`` and ``dend``.
        resist_decode: callable returning an iterable of resistance
            records.

    Returns:
        ``{key: entry}`` where ``key`` is
        ``(type, subtype, lineage, gene, mutation_string)`` and ``entry``
        holds ``"mut"``, ``"sort_key"`` and ``"resist"``. When ``key``
        already exists in *collection*, that existing entry is returned
        with the new resistance records appended. Returns ``None`` (after
        printing a warning) when the decoded mutations do not all belong
        to exactly one gene.

    Raises:
        ValueError: if ``mg["type"]`` is neither ``"A"`` nor ``"B"``.
    """
    mdec = list(mut_decode(mut))
    uniq_genes = {m["gene"] for m in mdec}
    if len(uniq_genes) != 1:
        print(
            "WARNING: Currently we DO NOT support mutation combinations spread across mutliple segments!\n"
            f"<{uniq_genes}>"
        )
        return None
    [gene] = uniq_genes

    labels = []
    sort_key = [len(mdec)]
    for m in mdec:
        if m["del"] == "Del":
            labels.append(f"{m['del']} {m['dstart']}-{m['dend']}")
            sort_key.append(f"{m['dstart']}-")
        else:
            # The wild-type group is optional in the patterns, so it may be
            # None; only its first residue is kept when several are listed.
            wild = (m["wild"] or "")[:1]
            labels.append(f"{wild}{m['pos']}{m['alt']}")
            sort_key.append(f"{m['pos']}{m['alt']}")
    mstr = "+".join(labels)

    if mg["type"] == "A":
        lineage = map_h3n2_sub[mg["lin"]] if mg["sub"] == "H3N2" else mg["lin"]
    elif mg["type"] == "B":
        lineage = mg["lin"]
    else:
        print(f"Bad type <{mg['type']}>")
        raise ValueError(f"Bad type <{mg['type']}>")
    tup = (mg["type"], mg["sub"], lineage, gene, mstr)

    if tup in collection:
        entry = collection[tup]
    else:
        entry = {
            "mut": mdec,
            "sort_key": tuple(sort_key),
            "resist": [],
        }
    entry["resist"].extend(resist_decode(mut))
    return {tup: entry}
# Reader registry: maps a format name to a (mutation decoder, resistance
# decoder) pair. load_isciii() selects the pair through its ``sub_r`` argument.
sub_readers = {
    "ngs_surv": (iscii_ngs_surv_mut_decode, iscii_mut_data_resist_decode),
    "inhibit": (iscii_inhibit_mut_decode, iscii_inhibit_resist_decode),
    "mut_data": (iscii_mut_data_mut_decode, iscii_mut_data_resist_decode),
}
# Strain label pattern: optional "Type " prefix, the influenza type (A or B),
# and optionally a parenthesised subtype such as "(H3N2)" followed directly by
# a lineage suffix made of word characters (possibly empty), e.g.
# "A(H1N1)pdm09" or "A(H3N2)v". Named groups: type, sub, lin; sub and lin are
# None when no subtype is present.
rx_typing = re.compile(
    r"(?:Type )?(?P<type>[AB])(?:\((?P<sub>H\d+N\d+)\)(?P<lin>\w*))?"
)
# Pattern for the free-text form "Influenza subtypes mentioned in articles:
# <list>"; the remainder of the string is captured as group "list".
# NOTE(review): only referenced from commented-out code in load_isciii().
rx_sentence = re.compile(
    r"Influenza\W+subtypes\W+mentioned\W+in\W+articles:(?P<list>.*)"
)
def load_isciii(data, sub_r="ngs_surv"):
    """Build the shared collection from an iterable of source records.

    Args:
        data: iterable of record dicts; each must have a ``"strain"`` key.
        sub_r: name of the decoder pair in ``sub_readers`` to use.

    Returns:
        dict keyed by the tuples produced by ``iscii_parser``.

    Records whose strain does not match ``rx_typing`` are reported and
    skipped, as are records for which ``iscii_parser`` returns ``None``.
    Free-text strain descriptions (the form ``rx_sentence`` describes) are
    not expanded; they fall under the skipped records.
    """
    collection = {}
    for mut in data:
        typed = rx_typing.match(mut["strain"])
        if typed is None:
            print(f"WARNING: skipping broken <{mut['strain']}>")
            continue
        strain_info = typed.groupdict()
        readers = sub_readers[sub_r]
        parsed = iscii_parser(collection, strain_info, mut, readers[0], readers[1])
        if parsed is not None:
            collection.update(parsed)
    return collection
isciii_converter.py — takes a collection from the loader and returns a DataFrame in roughly the same format as the RSV-A/B output:
import pandas as pd
def flurify_isciii(collection, protein, skip_resist=False):
    """Convert the entries of *collection* for one protein into a DataFrame.

    Args:
        collection: mapping as produced by the loader; keys are
            ``(type, subtype, lineage, gene, mutation_string)`` tuples and
            values hold ``"sort_key"``, ``"resist"`` and ``"mut"``.
        protein: only entries whose key has this gene (index 3) are kept.
        skip_resist: when true, ``Comment`` is empty and the
            ``resist_list`` / ``mut_list`` columns are omitted.

    Returns:
        ``pandas.DataFrame`` with the columns ``Type``, ``Subtype``,
        ``Lineage``, ``AA``, ``Comment`` (plus ``resist_list`` and
        ``mut_list`` unless *skip_resist*), ordered by
        ``(type, subtype, lineage) + sort_key``.
    """

    def fold_text(r):
        # The fallback is computed before the lookup, exactly like an eagerly
        # evaluated default argument of dict.get().
        if "min_fold_change" in r:
            if "" == r.get("min_fold_change", ""):
                fallback = f"(<{r['max_fold_change']})"
            elif "" == r.get("max_fold_change", ""):
                fallback = f"(>{r['min_fold_change']})"
            else:
                fallback = f"({r['min_fold_change']}-{r['max_fold_change']})"
        else:
            fallback = ""
        return r.get("fold_change", fallback)

    def describe(r):
        text = f"resistance against {r['antiviral_resistance']} "
        if r.get("resistance_level", None):
            text += f"level {r['resistance_level']} "
        return text + fold_text(r)

    rows = {}
    for key, entry in collection.items():
        if not (protein == key[3]):
            continue
        row_key = (key[0], key[1] or "", key[2] or "") + entry["sort_key"]

        if skip_resist:
            comment = ""
        else:
            comment = "; ".join([describe(r) for r in entry["resist"]]) + "."

        # Type B rows without a lineage get a single space rather than "".
        if key[2]:
            lineage = key[2]
        elif "B" == key[0]:
            lineage = " "
        else:
            lineage = ""

        row = {
            "Type": key[0],
            "Subtype": key[1] or "",
            "Lineage": lineage,
            "AA": key[4],
            "Comment": comment,
        }
        if not skip_resist:
            row["resist_list"] = entry["resist"]
            row["mut_list"] = entry["mut"]
        rows[row_key] = row

    ordered = [rows[k] for k in sorted(rows.keys())]
    return pd.DataFrame(data=ordered)
The BU-ISCIII/flu_resistance repo documents resistance mutations for Influenza subtypes.
Implement in the collection-seeder
We want to add this as another Source in the collection-seeder, similar to how RSV resistance mutations were added in #1198.
The source should generate resistance mutation collections for Influenza subtypes (H1N1, H3N2, B/Vic, H5N1 as applicable). For each subtype/organism, it should create collections grouped by antiviral (e.g. Oseltamivir, Zanamivir, Baloxavir, etc.), with variants named after the resistance level (e.g. 'Resistance', 'Partial resistance').
In the list of all sources, these should be placed after the RSV resistance mutations to keep IDs predictable on first generation.
Helper code
Two helper files from Ivan for loading and converting the BU-ISCIII JSON formats.
isciii_loader.py — loads the various JSON formats into a shared collection structure.
isciii_converter.py — takes a collection from the loader and returns a DataFrame in roughly the same format as the RSV-A/B output.