diff --git a/app/colocalization/payload.py b/app/colocalization/payload.py index b0b55d6..34f71ad 100644 --- a/app/colocalization/payload.py +++ b/app/colocalization/payload.py @@ -15,8 +15,8 @@ from app.utils.gtex import get_gtex_snp_matches, gene_names from app.utils import ( get_session_filepath, - parse_region_text, ) +from app.utils.helpers import parse_region_text class SessionFiles: diff --git a/app/colocalization/stages/read_gwas_file.py b/app/colocalization/stages/read_gwas_file.py index d5fa82c..88f233b 100644 --- a/app/colocalization/stages/read_gwas_file.py +++ b/app/colocalization/stages/read_gwas_file.py @@ -229,11 +229,10 @@ def _infer_gwas_columns( self, payload: SessionPayload, gwas_data: pd.DataFrame ) -> pd.DataFrame: """ - Given that the user wishes to infer the variant information using dbSNP, - validate and rename the columns in the DataFrame accordingly, if possible. - - This assumes that the only columns provided in the GWAS file are "SNP" and "P", where "SNP" contains - snp IDs that can be looked up using dbSNP for variant information. + Look up CHROM/POS/REF/ALT from dbSNP based on information in the SNP column, + then validate and rename the columns in the DataFrame accordingly, if possible. + This assumes that "SNP" contains SNP IDs that can be looked up using dbSNP for variant information. + If the dataframe has columns already named CHROM, POS, REF, or ALT, they will be overwritten. Prerequisite: gwas_data columns are set as default values (eg. "SNP", "P", etc.) """ @@ -246,17 +245,25 @@ def _infer_gwas_columns( regionstr = payload.get_locus() variant_list = standardize_snps(list(gwas_data["SNP"]), regionstr, coordinate) + if all(x == "." for x in variant_list): raise InvalidUsage( f"None of the variants provided could be mapped to {regionstr}!", status_code=410, ) + var_df = decompose_variant_list(variant_list) - gwas_data = pd.concat([var_df, gwas_data], axis=1) # add CHROM, POS, REF, ALT - gwas_data = gwas_data.loc[ - [str(x) != "." for x in list(gwas_data["CHROM"])] - ].copy() - gwas_data.reset_index(drop=True, inplace=True) + + gwas_data = gwas_data.reset_index(drop=True).join( + var_df.reset_index(drop=True), lsuffix="_orig" + ) + + gwas_data.drop( + [c for c in gwas_data.columns if c.endswith("_orig")], axis=1, inplace=True + ) + + gwas_data = gwas_data.loc[gwas_data["CHROM"] != "."] + return gwas_data def _validate_gwas_file( diff --git a/app/colocalization/stages/report_gtex_data.py b/app/colocalization/stages/report_gtex_data.py index 4ac79c1..d1e0dc0 100644 --- a/app/colocalization/stages/report_gtex_data.py +++ b/app/colocalization/stages/report_gtex_data.py @@ -51,7 +51,7 @@ def invoke(self, payload: SessionPayload) -> SessionPayload: elif gtex_version == "V10": gene = "ENSG00000174502.18" - snp_list = [asnp.split(";")[0] for asnp in payload.gwas_data_kept["SNP"]] + snp_list = payload.std_snp_list[payload.gwas_indices_kept] if len(gtex_tissues) > 0: for tissue in tqdm(gtex_tissues): diff --git a/app/routes.py b/app/routes.py index b467cb1..b34afcb 100644 --- a/app/routes.py +++ b/app/routes.py @@ -9,7 +9,6 @@ import subprocess from datetime import datetime import re -import pysam import glob import tarfile from typing import Dict, Optional, Tuple, List, Union @@ -34,6 +33,7 @@ from app.utils.gtex import get_gtex, get_gtex_data from app.utils.errors import InvalidUsage, ServerError from app.utils.numpy_encoder import NumpyEncoder +from app.utils.helpers import parse_region_text from app import ext, mongo from app.cache import cache @@ -148,71 +148,6 @@ def __getattribute__(self, name): valid_populations = ["EUR", "AFR", "EAS", "SAS", "AMR", "ASN", "NFE"] -#################################### -# Helper functions -#################################### -def parseRegionText(regiontext, build): - if build not in ["hg19", "hg38"]: - raise InvalidUsage(f"Unrecognized build: {build}", status_code=410) - regiontext = regiontext.strip().replace(" ", "").replace(",", "").replace("chr", "") - if not re.search( - r"^\d+:\d+-\d+$", regiontext.replace("X", "23").replace("x", "23") - ): - raise InvalidUsage( - f"Invalid coordinate format. {regiontext} e.g. 1:205,000,000-206,000,000", - status_code=410, - ) - chrom = regiontext.split(":")[0].lower().replace("chr", "").upper() - pos = regiontext.split(":")[1] - startbp = pos.split("-")[0].replace(",", "") - endbp = pos.split("-")[1].replace(",", "") - chromLengths = pd.read_csv( - os.path.join(app.config["LF_DATA_FOLDER"], build + "_chrom_lengths.txt"), - sep="\t", - encoding="utf-8", - ) - chromLengths.set_index("sequence", inplace=True) - if chrom in ["X", "x"] or chrom == "23": - chrom = 23 - maxChromLength = chromLengths.loc["chrX", "length"] - try: - startbp = int(startbp) - endbp = int(endbp) - except Exception: - raise InvalidUsage( - f"Invalid coordinates input: {regiontext}", status_code=410 - ) - else: - try: - chrom = int(chrom) - if chrom == 23: - maxChromLength = chromLengths.loc["chrX", "length"] - else: - maxChromLength = chromLengths.loc["chr" + str(chrom), "length"] - startbp = int(startbp) - endbp = int(endbp) - except Exception: - raise InvalidUsage( - f"Invalid coordinates input {regiontext}", status_code=410 - ) - if chrom < 1 or chrom > 23: - raise InvalidUsage("Chromosome input must be between 1 and 23", status_code=410) - elif startbp > endbp: - raise InvalidUsage( - "Starting chromosome basepair position is greater than ending basepair position", - status_code=410, - ) - elif startbp > maxChromLength or endbp > maxChromLength: - raise InvalidUsage("Start or end coordinates are out of range", status_code=410) - elif (endbp - startbp) > genomicWindowLimit: - raise InvalidUsage( - f"Entered region size is larger than {genomicWindowLimit / 10**6} Mbp", - status_code=410, - ) - else: - return chrom, startbp, endbp - - def parseSNP(snp_text): """ Extract chrom, position from string of format "chr{chrom}:{position}". @@ -385,85 +320,6 @@ def buildSNPlist(df, chromcol, poscol, refcol, altcol, build): return snplist -def fetchSNV(chrom, bp, ref, build): - variantid = "." - - if ref is None or ref == ".": - ref = "" - - # Ensure valid region: - try: - regiontxt = str(chrom) + ":" + str(bp) + "-" + str(int(bp) + 1) - except Exception: - raise InvalidUsage(f"Invalid input for {str(chrom):str(bp)}") - chrom, startbp, endbp = parseRegionText(regiontxt, build) - chrom = str(chrom).replace("chr", "").replace("23", "X") - - # Load dbSNP151 SNP names from region indicated - dbsnp_filepath = "" - if build.lower() in ["hg38", "grch38"]: - suffix = "b38" - dbsnp_filepath = os.path.join( - app.config["LF_DATA_FOLDER"], "dbSNP151", "GRCh38p7", "All_20180418.vcf.gz" - ) - else: - suffix = "b37" - dbsnp_filepath = os.path.join( - app.config["LF_DATA_FOLDER"], "dbSNP151", "GRCh37p13", "All_20180423.vcf.gz" - ) - - # Load variant info from dbSNP151 - tbx = pysam.TabixFile(dbsnp_filepath) - varlist = [] - for row in tbx.fetch(str(chrom), bp - 1, bp): - rowlist = str(row).split("\t") - chromi = rowlist[0].replace("chr", "") - posi = rowlist[1] - refi = rowlist[3] - alti = rowlist[4] - varstr = "_".join([chromi, posi, refi, alti, suffix]) - varlist.append(varstr) - - # Check if there is a match to an SNV with the provided info - if len(varlist) == 1: - variantid = varstr - elif len(varlist) > 1 and ref != "": - for v in varlist: - if v.split("_")[2] == ref: - variantid = v - break - return variantid - - -def decomposeVariant(variant_list): - """ - Parameters - ---------- - variantid_list : list - list of str standardized variants in chr_pos_ref_alt_build format - - Returns - ------- - A pandas.dataframe with chromosome, pos, reference and alternate alleles columns - """ - chromlist = [x.split("_")[0] if len(x.split("_")) == 5 else x for x in variant_list] - chromlist = [int(x) if x not in ["X", "."] else x for x in chromlist] - poslist = [ - int(x.split("_")[1]) if len(x.split("_")) == 5 else x for x in variant_list - ] - reflist = [x.split("_")[2] if len(x.split("_")) == 5 else x for x in variant_list] - altlist = [x.split("_")[3] if len(x.split("_")) == 5 else x for x in variant_list] - df = pd.DataFrame( - { - DEFAULT_FORM_VALUE_DICT[FormID.CHROM_COL]: chromlist, - DEFAULT_FORM_VALUE_DICT[FormID.POS_COL]: poslist, - DEFAULT_FORM_VALUE_DICT[FormID.REF_COL]: reflist, - DEFAULT_FORM_VALUE_DICT[FormID.ALT_COL]: altlist, - } - ) - return df - - def addVariantID(gwas_data, chromcol, poscol, refcol, altcol, build="hg19"): """ @@ -508,7 +364,7 @@ def subsetLocus(build, summaryStats, regiontext, chromcol, poscol, pcol): if regiontext == "": regiontext = DEFAULT_FORM_VALUE_DICT[FormID.LOCUS] # print('Parsing region text') - chrom, startbp, endbp = parseRegionText(regiontext, build) + chrom, startbp, endbp = parse_region_text(regiontext, build) summaryStats = summaryStats.loc[ [str(x) != "." for x in list(summaryStats[chromcol])] ].copy() @@ -1147,7 +1003,7 @@ def get_multiple_regions(regionstext: str, build: str) -> List[Tuple[int, int, i regionstext = regionstext.splitlines() regions = [] for regiontext in regionstext: - chrom, start, end = parseRegionText(regiontext, build) + chrom, start, end = parse_region_text(regiontext, build) regions.append((chrom, start, end)) return regions @@ -1729,7 +1585,9 @@ def setbasedtest(): combine_lds = False - snps_used_in_test = [] # List of list of positions, one list per test; position is (chrom, bp) tuple + snps_used_in_test = ( + [] + ) # List of list of positions, one list per test; position is (chrom, bp) tuple # TODO: need to determine used SNPs AFTER tests are performed diff --git a/app/utils/__init__.py b/app/utils/__init__.py index 8e8456b..35438c1 100644 --- a/app/utils/__init__.py +++ b/app/utils/__init__.py @@ -15,6 +15,7 @@ from werkzeug.datastructures import FileStorage from app.utils.errors import InvalidUsage +from app.utils.helpers import parse_region_text from app.utils.variants import get_variants_by_region GENOMIC_WINDOW_LIMIT = 2e6 @@ -107,6 +108,36 @@ def decompose_variant_list(variant_list): return df +def get_dbnsp_snps(chrom: str, startbp: int, endbp: int, build: str) -> List[str]: + """Fetch SNPs from dbSNP for a certain region and build. + + :return: The SNPs as a list of strings, the first five values are CHROM (no prefix), + POS, ID, REF, ALT + :rtype: List[str] + """ + + if build.lower() in ["hg38", "grch38"]: + dbsnp_filepath = os.path.join( + app.config["LF_DATA_FOLDER"], "dbSNP151", "GRCh38p7", "All_20180418.vcf.gz" + ) + else: + dbsnp_filepath = os.path.join( + app.config["LF_DATA_FOLDER"], "dbSNP151", "GRCh37p13", "All_20180423.vcf.gz" + ) + + # Load dbSNP file + tbx = pysam.TabixFile(dbsnp_filepath) # type: ignore + + result = [] + + for row in tbx.fetch(str(chrom), startbp, endbp): + rowlist = row.split("\t") + rowlist[0] = rowlist[0].replace("chr", "") + result.append(rowlist) + + return result + + def standardize_snps(variantlist, regiontxt, build): """ Input: Variant names in any of these formats: rsid, chrom_pos_ref_alt, chrom:pos_ref_alt, chrom:pos_ref_alt_b37/b38 @@ -126,60 +157,33 @@ def standardize_snps(variantlist, regiontxt, build): # Ensure valid region: chrom, startbp, endbp = parse_region_text(regiontxt, build) + + variants_df = get_variants_by_region(int(startbp), int(endbp), str(chrom), "V10") + + rsid_colname = ( + "rs_id_dbSNP147_GRCh37p13" + if build.lower() in ["hg38", "grch38"] + else "rs_id_dbSNP151_GRCh38p7" + ) + + suffix = "b38" if build.lower() in ["hg38", "grch38"] else "b37" + chrom = str(chrom).replace("23", "X") + + dbsnp_snps = get_dbnsp_snps(chrom, startbp, endbp, build) + if build.lower() in ["hg19", "grch37"]: raise InvalidUsage( "Cannot standardize SNPs to hg19; GTEx V7 is no longer available." ) - # Lookup variants in GTEx V10 db - variants_df = get_variants_by_region(int(startbp), int(endbp), str(chrom), "V10") - - # Load dbSNP151 SNP names from region indicated - dbsnp_filepath = "" - suffix = "b37" - if build.lower() in ["hg38", "grch38"]: - rsid_colname = "rs_id_dbSNP151_GRCh38p7" - suffix = "b38" - dbsnp_filepath = os.path.join( - app.config["LF_DATA_FOLDER"], "dbSNP151", "GRCh38p7", "All_20180418.vcf.gz" - ) - else: - rsid_colname = "rs_id_dbSNP147_GRCh37p13" - suffix = "b37" - dbsnp_filepath = os.path.join( - app.config["LF_DATA_FOLDER"], "dbSNP151", "GRCh37p13", "All_20180423.vcf.gz" - ) - - # Load dbSNP file - # delayeddf = delayed(pd.read_csv)(dbsnp_filepath,skiprows=getNumHeaderLines(dbsnp_filepath),sep='\t') - # dbsnp = dd.from_delayed(delayeddf) - - tbx = pysam.TabixFile(dbsnp_filepath) # type: ignore - # print('Compiling list of known variants in the region from dbSNP151') - chromcol = [] - poscol = [] - idcol = [] - refcol = [] - altcol = [] - variantid = [] # in chr_pos_ref_alt_build format rsids = dict( {} ) # a multi-allelic variant rsid (key) can be represented in several variantid formats (values) - for row in tbx.fetch(str(chrom), startbp, endbp): - rowlist = str(row).split("\t") - chromi = rowlist[0].replace("chr", "") - posi = rowlist[1] - idi = rowlist[2] - refi = rowlist[3] - alti = rowlist[4] + for row in dbsnp_snps: + chromi, posi, idi, refi, alti = row[:5] varstr = "_".join([chromi, posi, refi, alti, suffix]) - chromcol.append(chromi) - poscol.append(posi) - idcol.append(idi) - refcol.append(refi) - altcol.append(alti) - variantid.append(varstr) + rsids[idi] = [varstr] altalleles = alti.split( "," @@ -191,7 +195,6 @@ def standardize_snps(variantlist, regiontxt, build): varstr = "_".join([chromi, posi, refi, altalleles[i + 1], suffix]) rsids[idi].append(varstr) - # print('Cleaning and mapping list of variants') variantlist = [ asnp.split(";")[0].replace(":", "_").replace(".", "") for asnp in variantlist ] # cleaning up the SNP names a bit @@ -200,9 +203,7 @@ def standardize_snps(variantlist, regiontxt, build): if variant == "": stdvariantlist.append(".") continue - variantstr = variant.replace("chr", "") - if re.search("^23_", variantstr): - variantstr = variantstr.replace("23_", "X_", 1) + variantstr = variant.replace("chr23_", "chrX_").replace("chr", "") if variantstr.startswith("rs"): try: # Here's the difference from the first function version (we look at GTEx first) @@ -223,7 +224,7 @@ def standardize_snps(variantlist, regiontxt, build): strlist = variantstr.split("_") strlist = list(filter(None, strlist)) # remove empty strings try: - achr, astart, aend = parse_region_text( + achr, astart, _ = parse_region_text( strlist[0] + ":" + strlist[1] + "-" + str(int(strlist[1]) + 1), build, ) @@ -269,68 +270,6 @@ def standardize_snps(variantlist, regiontxt, build): return stdvariantlist -def parse_region_text(regiontext, build): - if build not in ["hg19", "hg38"]: - raise InvalidUsage(f"Unrecognized build: {build}", status_code=410) - regiontext = regiontext.strip().replace(" ", "").replace(",", "").replace("chr", "") - if not re.search( - r"^\d+:\d+-\d+$", regiontext.replace("X", "23").replace("x", "23") - ): - raise InvalidUsage( - f"Invalid coordinate format. '{regiontext}' e.g. 1:205,000,000-206,000,000", - status_code=410, - ) - chrom = regiontext.split(":")[0].lower().replace("chr", "").upper() - pos = regiontext.split(":")[1] - startbp = pos.split("-")[0].replace(",", "") - endbp = pos.split("-")[1].replace(",", "") - chromLengths = pd.read_csv( - os.path.join(app.config["LF_DATA_FOLDER"], build + "_chrom_lengths.txt"), - sep="\t", - encoding="utf-8", - ) - chromLengths.set_index("sequence", inplace=True) - if chrom in ["X", "x"] or chrom == "23": - chrom = 23 - maxChromLength = chromLengths.loc["chrX", "length"] - try: - startbp = int(startbp) - endbp = int(endbp) - except Exception: - raise InvalidUsage( - f"Invalid coordinates input: '{regiontext}'", status_code=410 - ) - else: - try: - chrom = int(chrom) - if chrom == 23: - maxChromLength = chromLengths.loc["chrX", "length"] - else: - maxChromLength = chromLengths.loc["chr" + str(chrom), "length"] - startbp = int(startbp) - endbp = int(endbp) - except Exception: - raise InvalidUsage( - f"Invalid coordinates input '{regiontext}'", status_code=410 - ) - if chrom < 1 or chrom > 23: - raise InvalidUsage("Chromosome input must be between 1 and 23", status_code=410) - elif startbp > endbp: - raise InvalidUsage( - "Starting chromosome basepair position is greater than ending basepair position", - status_code=410, - ) - elif startbp > maxChromLength or endbp > maxChromLength: - raise InvalidUsage("Start or end coordinates are out of range", status_code=410) - elif (endbp - startbp) > GENOMIC_WINDOW_LIMIT: - raise InvalidUsage( - f"Entered region size is larger than {GENOMIC_WINDOW_LIMIT / 1e6} Mbp", - status_code=410, - ) - else: - return chrom, startbp, endbp - - def fetch_snv(chrom, bp, ref, build): variantid = "." @@ -342,31 +281,15 @@ def fetch_snv(chrom, bp, ref, build): regiontxt = str(chrom) + ":" + str(bp) + "-" + str(int(bp) + 1) except Exception: raise InvalidUsage(f"Invalid input for {str(chrom):str(bp)}") - chrom, startbp, endbp = parse_region_text(regiontxt, build) + chrom = parse_region_text(regiontxt, build)[0] chrom = str(chrom).replace("chr", "").replace("23", "X") + suffix = "b37" if build.lower() in ["hg38", "grch38"] else "b37" - # Load dbSNP151 SNP names from region indicated - dbsnp_filepath = "" - if build.lower() in ["hg38", "grch38"]: - suffix = "b38" - dbsnp_filepath = os.path.join( - app.config["LF_DATA_FOLDER"], "dbSNP151", "GRCh38p7", "All_20180418.vcf.gz" - ) - else: - suffix = "b37" - dbsnp_filepath = os.path.join( - app.config["LF_DATA_FOLDER"], "dbSNP151", "GRCh37p13", "All_20180423.vcf.gz" - ) + dbsnps = get_dbnsp_snps(chrom, bp - 1, bp, build) - # Load variant info from dbSNP151 - tbx = pysam.TabixFile(dbsnp_filepath) # type: ignore varlist = [] - for row in tbx.fetch(str(chrom), bp - 1, bp): - rowlist = str(row).split("\t") - chromi = rowlist[0].replace("chr", "") - posi = rowlist[1] - refi = rowlist[3] - alti = rowlist[4] + for row in dbsnps: + chromi, posi, _, refi, alti = row[:5] varstr = "_".join([chromi, posi, refi, alti, suffix]) varlist.append(varstr) diff --git a/app/utils/gencode.py b/app/utils/gencode.py index 31e26b1..d3a5c7f 100644 --- a/app/utils/gencode.py +++ b/app/utils/gencode.py @@ -6,7 +6,7 @@ from typing import List import pandas as pd -from app.utils import parse_region_text +from app.utils.helpers import parse_region_text # /app/utils/gencode.py -> /data/ DATA_FOLDER = os.path.abspath( diff --git a/app/utils/helpers.py b/app/utils/helpers.py index 73a8529..6da48e3 100644 --- a/app/utils/helpers.py +++ b/app/utils/helpers.py @@ -1,7 +1,14 @@ +import os +import re from typing import List +from flask import current_app as app import pandas as pd +from app.utils.errors import InvalidUsage + +GENOMIC_WINDOW_LIMIT = 2e6 + def validate_chromosome( chr: str | int, prefix: str | None = "chr", x_y_numeric: bool = False @@ -109,3 +116,65 @@ def adjust_snp_column( ) return snps_df + + +def parse_region_text(regiontext, build): + if build not in ["hg19", "hg38"]: + raise InvalidUsage(f"Unrecognized build: {build}", status_code=410) + regiontext = regiontext.strip().replace(" ", "").replace(",", "").replace("chr", "") + if not re.search( + r"^\d+:\d+-\d+$", regiontext.replace("X", "23").replace("x", "23") + ): + raise InvalidUsage( + f"Invalid coordinate format. '{regiontext}' e.g. 1:205,000,000-206,000,000", + status_code=410, + ) + chrom = regiontext.split(":")[0].lower().replace("chr", "").upper() + pos = regiontext.split(":")[1] + startbp = pos.split("-")[0].replace(",", "") + endbp = pos.split("-")[1].replace(",", "") + chromLengths = pd.read_csv( + os.path.join(app.config["LF_DATA_FOLDER"], build + "_chrom_lengths.txt"), + sep="\t", + encoding="utf-8", + ) + chromLengths.set_index("sequence", inplace=True) + if chrom in ["X", "x"] or chrom == "23": + chrom = 23 + maxChromLength = chromLengths.loc["chrX", "length"] + try: + startbp = int(startbp) + endbp = int(endbp) + except Exception: + raise InvalidUsage( + f"Invalid coordinates input: '{regiontext}'", status_code=410 + ) + else: + try: + chrom = int(chrom) + if chrom == 23: + maxChromLength = chromLengths.loc["chrX", "length"] + else: + maxChromLength = chromLengths.loc["chr" + str(chrom), "length"] + startbp = int(startbp) + endbp = int(endbp) + except Exception: + raise InvalidUsage( + f"Invalid coordinates input '{regiontext}'", status_code=410 + ) + if chrom < 1 or chrom > 23: + raise InvalidUsage("Chromosome input must be between 1 and 23", status_code=410) + elif startbp > endbp: + raise InvalidUsage( + "Starting chromosome basepair position is greater than ending basepair position", + status_code=410, + ) + elif startbp > maxChromLength or endbp > maxChromLength: + raise InvalidUsage("Start or end coordinates are out of range", status_code=410) + elif (endbp - startbp) > GENOMIC_WINDOW_LIMIT: + raise InvalidUsage( + f"Entered region size is larger than {GENOMIC_WINDOW_LIMIT/1e6} Mbp", + status_code=410, + ) + else: + return chrom, startbp, endbp