Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion app/colocalization/payload.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@
from app.utils.gtex import get_gtex_snp_matches, gene_names
from app.utils import (
get_session_filepath,
parse_region_text,
)
from app.utils.helpers import parse_region_text


class SessionFiles:
Expand Down
27 changes: 17 additions & 10 deletions app/colocalization/stages/read_gwas_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -229,11 +229,10 @@ def _infer_gwas_columns(
self, payload: SessionPayload, gwas_data: pd.DataFrame
) -> pd.DataFrame:
"""
Given that the user wishes to infer the variant information using dbSNP,
validate and rename the columns in the DataFrame accordingly, if possible.

This assumes that the only columns provided in the GWAS file are "SNP" and "P", where "SNP" contains
snp IDs that can be looked up using dbSNP for variant information.
Look up CHROM/POS/REF/ALT from dbSNP based on information in the SNP column,
then validate and rename the columns in the DataFrame accordingly, if possible.
This assumes that "SNP" contains SNP IDs that can be looked up using dbSNP for variant information.
If the dataframe has columns already named CHROM, POS, REF, or ALT, they will be overwritten.

Prerequisite: gwas_data columns are set as default values (eg. "SNP", "P", etc.)
"""
Expand All @@ -246,17 +245,25 @@ def _infer_gwas_columns(
regionstr = payload.get_locus()

variant_list = standardize_snps(list(gwas_data["SNP"]), regionstr, coordinate)

if all(x == "." for x in variant_list):
raise InvalidUsage(
f"None of the variants provided could be mapped to {regionstr}!",
status_code=410,
)

var_df = decompose_variant_list(variant_list)
gwas_data = pd.concat([var_df, gwas_data], axis=1) # add CHROM, POS, REF, ALT
gwas_data = gwas_data.loc[
[str(x) != "." for x in list(gwas_data["CHROM"])]
].copy()
gwas_data.reset_index(drop=True, inplace=True)

gwas_data = gwas_data.reset_index(drop=True).join(
var_df.reset_index(drop=True), lsuffix="_orig"
)

gwas_data.drop(
[c for c in gwas_data.columns if c.endswith("_orig")], axis=1, inplace=True
)

gwas_data = gwas_data.loc[gwas_data["CHROM"] != "."]

return gwas_data

def _validate_gwas_file(
Expand Down
2 changes: 1 addition & 1 deletion app/colocalization/stages/report_gtex_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ def invoke(self, payload: SessionPayload) -> SessionPayload:
elif gtex_version == "V10":
gene = "ENSG00000174502.18"

snp_list = [asnp.split(";")[0] for asnp in payload.gwas_data_kept["SNP"]]
snp_list = payload.std_snp_list[payload.gwas_indices_kept]

if len(gtex_tissues) > 0:
for tissue in tqdm(gtex_tissues):
Expand Down
154 changes: 6 additions & 148 deletions app/routes.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
import subprocess
from datetime import datetime
import re
import pysam
import glob
import tarfile
from typing import Dict, Optional, Tuple, List, Union
Expand All @@ -34,6 +33,7 @@
from app.utils.gtex import get_gtex, get_gtex_data
from app.utils.errors import InvalidUsage, ServerError
from app.utils.numpy_encoder import NumpyEncoder
from app.utils.helpers import parse_region_text

from app import ext, mongo
from app.cache import cache
Expand Down Expand Up @@ -148,71 +148,6 @@ def __getattribute__(self, name):
valid_populations = ["EUR", "AFR", "EAS", "SAS", "AMR", "ASN", "NFE"]


####################################
# Helper functions
####################################
def parseRegionText(regiontext, build):
if build not in ["hg19", "hg38"]:
raise InvalidUsage(f"Unrecognized build: {build}", status_code=410)
regiontext = regiontext.strip().replace(" ", "").replace(",", "").replace("chr", "")
if not re.search(
r"^\d+:\d+-\d+$", regiontext.replace("X", "23").replace("x", "23")
):
raise InvalidUsage(
f"Invalid coordinate format. {regiontext} e.g. 1:205,000,000-206,000,000",
status_code=410,
)
chrom = regiontext.split(":")[0].lower().replace("chr", "").upper()
pos = regiontext.split(":")[1]
startbp = pos.split("-")[0].replace(",", "")
endbp = pos.split("-")[1].replace(",", "")
chromLengths = pd.read_csv(
os.path.join(app.config["LF_DATA_FOLDER"], build + "_chrom_lengths.txt"),
sep="\t",
encoding="utf-8",
)
chromLengths.set_index("sequence", inplace=True)
if chrom in ["X", "x"] or chrom == "23":
chrom = 23
maxChromLength = chromLengths.loc["chrX", "length"]
try:
startbp = int(startbp)
endbp = int(endbp)
except Exception:
raise InvalidUsage(
f"Invalid coordinates input: {regiontext}", status_code=410
)
else:
try:
chrom = int(chrom)
if chrom == 23:
maxChromLength = chromLengths.loc["chrX", "length"]
else:
maxChromLength = chromLengths.loc["chr" + str(chrom), "length"]
startbp = int(startbp)
endbp = int(endbp)
except Exception:
raise InvalidUsage(
f"Invalid coordinates input {regiontext}", status_code=410
)
if chrom < 1 or chrom > 23:
raise InvalidUsage("Chromosome input must be between 1 and 23", status_code=410)
elif startbp > endbp:
raise InvalidUsage(
"Starting chromosome basepair position is greater than ending basepair position",
status_code=410,
)
elif startbp > maxChromLength or endbp > maxChromLength:
raise InvalidUsage("Start or end coordinates are out of range", status_code=410)
elif (endbp - startbp) > genomicWindowLimit:
raise InvalidUsage(
f"Entered region size is larger than {genomicWindowLimit / 10**6} Mbp",
status_code=410,
)
else:
return chrom, startbp, endbp


def parseSNP(snp_text):
"""
Extract chrom, position from string of format "chr{chrom}:{position}".
Expand Down Expand Up @@ -385,85 +320,6 @@ def buildSNPlist(df, chromcol, poscol, refcol, altcol, build):
return snplist


def fetchSNV(chrom, bp, ref, build):
variantid = "."

if ref is None or ref == ".":
ref = ""

# Ensure valid region:
try:
regiontxt = str(chrom) + ":" + str(bp) + "-" + str(int(bp) + 1)
except Exception:
raise InvalidUsage(f"Invalid input for {str(chrom):str(bp)}")
chrom, startbp, endbp = parseRegionText(regiontxt, build)
chrom = str(chrom).replace("chr", "").replace("23", "X")

# Load dbSNP151 SNP names from region indicated
dbsnp_filepath = ""
if build.lower() in ["hg38", "grch38"]:
suffix = "b38"
dbsnp_filepath = os.path.join(
app.config["LF_DATA_FOLDER"], "dbSNP151", "GRCh38p7", "All_20180418.vcf.gz"
)
else:
suffix = "b37"
dbsnp_filepath = os.path.join(
app.config["LF_DATA_FOLDER"], "dbSNP151", "GRCh37p13", "All_20180423.vcf.gz"
)

# Load variant info from dbSNP151
tbx = pysam.TabixFile(dbsnp_filepath)
varlist = []
for row in tbx.fetch(str(chrom), bp - 1, bp):
rowlist = str(row).split("\t")
chromi = rowlist[0].replace("chr", "")
posi = rowlist[1]
refi = rowlist[3]
alti = rowlist[4]
varstr = "_".join([chromi, posi, refi, alti, suffix])
varlist.append(varstr)

# Check if there is a match to an SNV with the provided info
if len(varlist) == 1:
variantid = varstr
elif len(varlist) > 1 and ref != "":
for v in varlist:
if v.split("_")[2] == ref:
variantid = v
break
return variantid


def decomposeVariant(variant_list):
"""
Parameters
----------
variantid_list : list
list of str standardized variants in chr_pos_ref_alt_build format

Returns
-------
A pandas.dataframe with chromosome, pos, reference and alternate alleles columns
"""
chromlist = [x.split("_")[0] if len(x.split("_")) == 5 else x for x in variant_list]
chromlist = [int(x) if x not in ["X", "."] else x for x in chromlist]
poslist = [
int(x.split("_")[1]) if len(x.split("_")) == 5 else x for x in variant_list
]
reflist = [x.split("_")[2] if len(x.split("_")) == 5 else x for x in variant_list]
altlist = [x.split("_")[3] if len(x.split("_")) == 5 else x for x in variant_list]
df = pd.DataFrame(
{
DEFAULT_FORM_VALUE_DICT[FormID.CHROM_COL]: chromlist,
DEFAULT_FORM_VALUE_DICT[FormID.POS_COL]: poslist,
DEFAULT_FORM_VALUE_DICT[FormID.REF_COL]: reflist,
DEFAULT_FORM_VALUE_DICT[FormID.ALT_COL]: altlist,
}
)
return df


def addVariantID(gwas_data, chromcol, poscol, refcol, altcol, build="hg19"):
"""

Expand Down Expand Up @@ -508,7 +364,7 @@ def subsetLocus(build, summaryStats, regiontext, chromcol, poscol, pcol):
if regiontext == "":
regiontext = DEFAULT_FORM_VALUE_DICT[FormID.LOCUS]
# print('Parsing region text')
chrom, startbp, endbp = parseRegionText(regiontext, build)
chrom, startbp, endbp = parse_region_text(regiontext, build)
summaryStats = summaryStats.loc[
[str(x) != "." for x in list(summaryStats[chromcol])]
].copy()
Expand Down Expand Up @@ -1147,7 +1003,7 @@ def get_multiple_regions(regionstext: str, build: str) -> List[Tuple[int, int, i
regionstext = regionstext.splitlines()
regions = []
for regiontext in regionstext:
chrom, start, end = parseRegionText(regiontext, build)
chrom, start, end = parse_region_text(regiontext, build)
regions.append((chrom, start, end))
return regions

Expand Down Expand Up @@ -1729,7 +1585,9 @@ def setbasedtest():

combine_lds = False

snps_used_in_test = [] # List of list of positions, one list per test; position is (chrom, bp) tuple
snps_used_in_test = (
[]
) # List of list of positions, one list per test; position is (chrom, bp) tuple

# TODO: need to determine used SNPs AFTER tests are performed

Expand Down
Loading