From 54e785f1678be5abf3e6c32da506949898995c7b Mon Sep 17 00:00:00 2001 From: cklamann <12862284+cklamann@users.noreply.github.com> Date: Tue, 14 Oct 2025 11:06:00 -0400 Subject: [PATCH 1/5] remove original chrom/pos/ref/alt cols if present when looking up --- app/colocalization/stages/read_gwas_file.py | 82 ++++++++++++++------- 1 file changed, 56 insertions(+), 26 deletions(-) diff --git a/app/colocalization/stages/read_gwas_file.py b/app/colocalization/stages/read_gwas_file.py index 8f2d956..09959ae 100644 --- a/app/colocalization/stages/read_gwas_file.py +++ b/app/colocalization/stages/read_gwas_file.py @@ -5,7 +5,12 @@ from flask import current_app as app from app.colocalization.payload import SessionPayload -from app.utils import get_file_with_ext, standardize_snps, decompose_variant_list, x_to_23 +from app.utils import ( + get_file_with_ext, + standardize_snps, + decompose_variant_list, + x_to_23, +) from app.pipeline import PipelineStage from app.utils.errors import InvalidUsage, ServerError @@ -21,9 +26,12 @@ class GWASColumn: coloc2: bool = False # Required for coloc2? optional: bool = False # Optional column + # (bi-allelic variants are allowed) # (prefix) chrom pos ref alt(s) build -VCF_FORMAT_PATTERN = "^(?:chr)?([0-9]{1,2})_([0-9]+)_([ATCG]+)_([ATCG]+(?:,[ATCG]+)*)_b3(?:7|8)$" +VCF_FORMAT_PATTERN = ( + "^(?:chr)?([0-9]{1,2})_([0-9]+)_([ATCG]+)_([ATCG]+(?:,[ATCG]+)*)_b3(?:7|8)$" +) class ReadGWASFileStage(PipelineStage): @@ -62,7 +70,7 @@ class ReadGWASFileStage(PipelineStage): def name(self) -> str: return "read-gwas-file" - + def description(self) -> str: return "Read GWAS data from file" @@ -86,7 +94,7 @@ def invoke(self, payload: SessionPayload) -> SessionPayload: self._snp_format_check(gwas_data) - payload.gwas_data_original = gwas_data.copy() # ouch + payload.gwas_data_original = gwas_data.copy() # ouch return payload @@ -96,7 +104,9 @@ def _read_gwas_file(self, payload: SessionPayload) -> pd.DataFrame: Save the file and return the dataframe. """ - gwas_filepath = get_file_with_ext(payload.uploaded_files, self.VALID_GWAS_EXTENSIONS) + gwas_filepath = get_file_with_ext( + payload.uploaded_files, self.VALID_GWAS_EXTENSIONS + ) if gwas_filepath is None: raise InvalidUsage( @@ -146,9 +156,9 @@ def _set_gwas_columns( column_input_list.append( payload.request_form.get(column.form_id, column.default) ) - column_mapper[ - payload.request_form.get(column.form_id, column.default) - ] = column.default + column_mapper[payload.request_form.get(column.form_id, column.default)] = ( + column.default + ) column_inputs[column.default] = payload.request_form.get(column.form_id, "") # Column uniqueness check @@ -181,7 +191,9 @@ def _set_gwas_columns( # Get coloc2 if applicable if payload.get_is_coloc2(): - for column in [c.default for c in self.GWAS_COLUMNS if c.coloc2 and not c.optional]: + for column in [ + c.default for c in self.GWAS_COLUMNS if c.coloc2 and not c.optional + ]: if column not in gwas_data.columns: raise InvalidUsage( f"{column} column missing but is required for COLOC2. '{column_inputs[column]}' not in columns '{', '.join(old_gwas_columns)}'" @@ -203,7 +215,13 @@ def _set_gwas_columns( gwas_data = pd.concat([gwas_data, num_cases_df], axis=1) # Drop columns that are not used - gwas_data = gwas_data.drop(columns=[c for c in gwas_data.columns if c not in (c.default for c in self.GWAS_COLUMNS)]) + gwas_data = gwas_data.drop( + columns=[ + c + for c in gwas_data.columns + if c not in (c.default for c in self.GWAS_COLUMNS) + ] + ) return gwas_data @@ -211,11 +229,10 @@ def _infer_gwas_columns( self, payload: SessionPayload, gwas_data: pd.DataFrame ) -> pd.DataFrame: """ - Given that the user wishes to infer the variant information using dbSNP, - validate and rename the columns in the DataFrame accordingly, if possible. - - This assumes that the only columns provided in the GWAS file are "SNP" and "P", where "SNP" contains - snp IDs that can be looked up using dbSNP for variant information. + Look up CHROM/POS/REF/ALT from dbSNP based on information in the SNP column, + then validate and rename the columns in the DataFrame accordingly, if possible. + This assumes that "SNP" contains SNP IDs that can be looked up using dbSNP for variant information. + If the dataframe has columns already named CHROM, POS, REF, or ALT, they will be overwritten. Prerequisite: gwas_data columns are set as default values (eg. "SNP", "P", etc.) """ @@ -228,17 +245,25 @@ def _infer_gwas_columns( regionstr = payload.get_locus() variant_list = standardize_snps(list(gwas_data["SNP"]), regionstr, coordinate) + if all(x == "." for x in variant_list): raise InvalidUsage( f"None of the variants provided could be mapped to {regionstr}!", status_code=410, ) + var_df = decompose_variant_list(variant_list) - gwas_data = pd.concat([var_df, gwas_data], axis=1) # add CHROM, POS, REF, ALT - gwas_data = gwas_data.loc[ - [str(x) != "." for x in list(gwas_data["CHROM"])] - ].copy() - gwas_data.reset_index(drop=True, inplace=True) + + gwas_data = gwas_data.reset_index(drop=True).join( + var_df.reset_index(drop=True), lsuffix="_orig" + ) + + gwas_data.drop( + [c for c in gwas_data.columns if c.endswith("_orig")], axis=1, inplace=True + ) + + gwas_data = gwas_data.loc[gwas_data["CHROM"] != "."] + return gwas_data def _validate_gwas_file( @@ -378,22 +403,27 @@ def _get_std_snp_list( raise ServerError("GWAS data and indices are not in sync") return std_snp_list - + def _snp_format_check(self, gwas_data: pd.DataFrame) -> None: """ Perform a sanity check on the SNP column of the GWAS data. - + Raise an error if the SNP column contains any SNPs with format chr_pos_ref_alt_build have reversed alleles, or other inconsistencies within its row in the dataframe. """ vcf_snps = gwas_data[gwas_data["SNP"].str.contains(VCF_FORMAT_PATTERN)] if len(vcf_snps) > 0: - expanded = gwas_data["SNP"].str.extract(VCF_FORMAT_PATTERN) \ - .rename(columns={0: "CHROM", 1: "POS", 2: "REF", 3: "ALT"}) \ + expanded = ( + gwas_data["SNP"] + .str.extract(VCF_FORMAT_PATTERN) + .rename(columns={0: "CHROM", 1: "POS", 2: "REF", 3: "ALT"}) .astype({"CHROM": int, "POS": int, "REF": str, "ALT": str}) - - merged_vcf_snps = expanded.merge(vcf_snps, how="inner", on=["CHROM", "POS", "REF", "ALT"]) + ) + + merged_vcf_snps = expanded.merge( + vcf_snps, how="inner", on=["CHROM", "POS", "REF", "ALT"] + ) if len(vcf_snps) != len(merged_vcf_snps): raise InvalidUsage( "GWAS data contains SNPs with chrom_pos_ref_alt_build format that are not consistent (eg. reversed alleles, incorrect position or chromosome). " From 522ce31aaa01c3901e55a2491808bda70173b4e4 Mon Sep 17 00:00:00 2001 From: cklamann <12862284+cklamann@users.noreply.github.com> Date: Tue, 14 Oct 2025 11:07:09 -0400 Subject: [PATCH 2/5] use standardize snp list rather than raw column in report_gtex_data.py --- app/colocalization/stages/report_gtex_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/colocalization/stages/report_gtex_data.py b/app/colocalization/stages/report_gtex_data.py index a3d4a0b..34e157f 100644 --- a/app/colocalization/stages/report_gtex_data.py +++ b/app/colocalization/stages/report_gtex_data.py @@ -45,7 +45,7 @@ def invoke(self, payload: SessionPayload) -> SessionPayload: elif gtex_version == "V8": gene = "ENSG00000174502.18" - snp_list = [asnp.split(";")[0] for asnp in payload.gwas_data["SNP"]] + snp_list = payload.std_snp_list if len(gtex_tissues) > 0: for tissue in tqdm(gtex_tissues): From 3068fafe84bf75326173d2bd6316ef57b5013377 Mon Sep 17 00:00:00 2001 From: cklamann <12862284+cklamann@users.noreply.github.com> Date: Tue, 14 Oct 2025 11:54:45 -0400 Subject: [PATCH 3/5] remove unused code from routes.py, break out dbsnp and gtex snp fetches into their own functions --- app/colocalization/payload.py | 12 +- app/routes.py | 481 ++-------------------------------- app/utils/__init__.py | 212 +++++---------- app/utils/gencode.py | 6 +- app/utils/gtex.py | 31 ++- app/utils/helpers.py | 117 ++++++++- 6 files changed, 216 insertions(+), 643 deletions(-) diff --git a/app/colocalization/payload.py b/app/colocalization/payload.py index 84bf498..e5348fb 100644 --- a/app/colocalization/payload.py +++ b/app/colocalization/payload.py @@ -16,8 +16,8 @@ from app.utils.gtex import get_gtex_snp_matches, gene_names from app.utils import ( get_session_filepath, - parse_region_text, ) +from app.utils.helpers import parse_region_text class SessionFiles: @@ -188,7 +188,10 @@ def get_secondary_coordinate(self) -> str: Return either 'hg19' or 'hg38'. If value is 'gwas', then get_coordinate() is used. """ - if self.request_form.get("html-file-coordinate") not in [*VALID_COORDINATES, "gwas"]: + if self.request_form.get("html-file-coordinate") not in [ + *VALID_COORDINATES, + "gwas", + ]: raise InvalidUsage( f"Invalid secondary dataset coordinate: '{self.request_form.get('html-file-coordinate')}'" ) @@ -434,7 +437,10 @@ def report_liftover(self) -> dict: if self.secondary_datasets_unlifted_indices is not None: data["secondary_datasets_unlifted"] = {} - for table_title, indices in self.secondary_datasets_unlifted_indices.items(): + for ( + table_title, + indices, + ) in self.secondary_datasets_unlifted_indices.items(): data["secondary_datasets_unlifted"][table_title] = indices return data diff --git a/app/routes.py b/app/routes.py index 76cfd40..d373164 100644 --- a/app/routes.py +++ b/app/routes.py @@ -9,7 +9,6 @@ import subprocess from datetime import datetime import re -import pysam import glob import tarfile from typing import Dict, Optional, Tuple, List, Union @@ -29,12 +28,13 @@ from pymongo.errors import ConnectionFailure from app.tasks import get_is_celery_running, run_pipeline_async -from app.utils import download_file +from app.utils import download_file, get_dbnsp_snps from app.utils.gencode import get_genes_by_location from app.utils.gtex import get_gtex, get_gtex_data from app.utils.apis.gtex import get_tissue_site_details from app.utils.errors import InvalidUsage, ServerError from app.utils.numpy_encoder import NumpyEncoder +from app.utils.helpers import parse_region_text from app import ext, mongo from app.cache import cache @@ -149,71 +149,6 @@ def __getattribute__(self, name): valid_populations = ["EUR", "AFR", "EAS", "SAS", "AMR", "ASN", "NFE"] -#################################### -# Helper functions -#################################### -def parseRegionText(regiontext, build): - if build not in ["hg19", "hg38"]: - raise InvalidUsage(f"Unrecognized build: {build}", status_code=410) - regiontext = regiontext.strip().replace(" ", "").replace(",", "").replace("chr", "") - if not re.search( - r"^\d+:\d+-\d+$", regiontext.replace("X", "23").replace("x", "23") - ): - raise InvalidUsage( - f"Invalid coordinate format. {regiontext} e.g. 1:205,000,000-206,000,000", - status_code=410, - ) - chrom = regiontext.split(":")[0].lower().replace("chr", "").upper() - pos = regiontext.split(":")[1] - startbp = pos.split("-")[0].replace(",", "") - endbp = pos.split("-")[1].replace(",", "") - chromLengths = pd.read_csv( - os.path.join(app.config["LF_DATA_FOLDER"], build + "_chrom_lengths.txt"), - sep="\t", - encoding="utf-8", - ) - chromLengths.set_index("sequence", inplace=True) - if chrom in ["X", "x"] or chrom == "23": - chrom = 23 - maxChromLength = chromLengths.loc["chrX", "length"] - try: - startbp = int(startbp) - endbp = int(endbp) - except: - raise InvalidUsage( - f"Invalid coordinates input: {regiontext}", status_code=410 - ) - else: - try: - chrom = int(chrom) - if chrom == 23: - maxChromLength = chromLengths.loc["chrX", "length"] - else: - maxChromLength = chromLengths.loc["chr" + str(chrom), "length"] - startbp = int(startbp) - endbp = int(endbp) - except: - raise InvalidUsage( - f"Invalid coordinates input {regiontext}", status_code=410 - ) - if chrom < 1 or chrom > 23: - raise InvalidUsage("Chromosome input must be between 1 and 23", status_code=410) - elif startbp > endbp: - raise InvalidUsage( - "Starting chromosome basepair position is greater than ending basepair position", - status_code=410, - ) - elif startbp > maxChromLength or endbp > maxChromLength: - raise InvalidUsage("Start or end coordinates are out of range", status_code=410) - elif (endbp - startbp) > genomicWindowLimit: - raise InvalidUsage( - f"Entered region size is larger than {genomicWindowLimit/10**6} Mbp", - status_code=410, - ) - else: - return chrom, startbp, endbp - - def parseSNP(snp_text): """ Extract chrom, position from string of format "chr{chrom}:{position}". @@ -386,370 +321,6 @@ def buildSNPlist(df, chromcol, poscol, refcol, altcol, build): return snplist -def fetchSNV(chrom, bp, ref, build): - variantid = "." - - if ref is None or ref == ".": - ref = "" - - # Ensure valid region: - try: - regiontxt = str(chrom) + ":" + str(bp) + "-" + str(int(bp) + 1) - except: - raise InvalidUsage(f"Invalid input for {str(chrom):str(bp)}") - chrom, startbp, endbp = parseRegionText(regiontxt, build) - chrom = str(chrom).replace("chr", "").replace("23", "X") - - # Load dbSNP151 SNP names from region indicated - dbsnp_filepath = "" - if build.lower() in ["hg38", "grch38"]: - suffix = "b38" - dbsnp_filepath = os.path.join( - app.config["LF_DATA_FOLDER"], "dbSNP151", "GRCh38p7", "All_20180418.vcf.gz" - ) - else: - suffix = "b37" - dbsnp_filepath = os.path.join( - app.config["LF_DATA_FOLDER"], "dbSNP151", "GRCh37p13", "All_20180423.vcf.gz" - ) - - # Load variant info from dbSNP151 - tbx = pysam.TabixFile(dbsnp_filepath) - varlist = [] - for row in tbx.fetch(str(chrom), bp - 1, bp): - rowlist = str(row).split("\t") - chromi = rowlist[0].replace("chr", "") - posi = rowlist[1] - idi = rowlist[2] - refi = rowlist[3] - alti = rowlist[4] - varstr = "_".join([chromi, posi, refi, alti, suffix]) - varlist.append(varstr) - - # Check if there is a match to an SNV with the provided info - if len(varlist) == 1: - variantid = varstr - elif len(varlist) > 1 and ref != "": - for v in varlist: - if v.split("_")[2] == ref: - variantid = v - break - return variantid - - -def standardizeSNPs(variantlist, regiontxt, build): - """ - Input: Variant names in any of these formats: rsid, chrom_pos_ref_alt, chrom:pos_ref_alt, chrom:pos_ref_alt_b37/b38 - Output: chrom_pos_ref_alt_b37/b38 variant ID format, but looks at GTEx variant lookup table first. - In the case of multi-allelic variants (e.g. rs2211330(T/A,C)), formats such as 1_205001063_T_A,C_b37 are accepted - If variant ID format is chr:pos, and the chr:pos has a unique biallelic SNV, then it will be assigned that variant - """ - - if all(x == "." for x in variantlist): - raise InvalidUsage("No variants provided") - - if np.nan in variantlist: - raise InvalidUsage( - "Missing variant IDs detected in row(s): " - + str([i + 1 for i, x in enumerate(variantlist) if str(x) == "nan"]) - ) - - # Ensure valid region: - chrom, startbp, endbp = parseRegionText(regiontxt, build) - chrom = str(chrom).replace("23", "X") - - # Load GTEx variant lookup table for region indicated - db = client.GTEx_V7 - rsid_colname = "rs_id_dbSNP147_GRCh37p13" - if build.lower() in ["hg38", "grch38"]: - db = client.GTEx_V8 - rsid_colname = "rs_id_dbSNP151_GRCh38p7" - collection = db["variant_table"] - variants_query = collection.find( - { - "$and": [ - {"chr": int(chrom.replace("X", "23"))}, - {"variant_pos": {"$gte": int(startbp), "$lte": int(endbp)}}, - ] - } - ) - variants_list = list(variants_query) - variants_df = pd.DataFrame(variants_list) - variants_df = variants_df.drop(["_id"], axis=1) - - # Load dbSNP151 SNP names from region indicated - dbsnp_filepath = "" - suffix = "b37" - if build.lower() in ["hg38", "grch38"]: - suffix = "b38" - dbsnp_filepath = os.path.join( - app.config["LF_DATA_FOLDER"], "dbSNP151", "GRCh38p7", "All_20180418.vcf.gz" - ) - else: - suffix = "b37" - dbsnp_filepath = os.path.join( - app.config["LF_DATA_FOLDER"], "dbSNP151", "GRCh37p13", "All_20180423.vcf.gz" - ) - - # Load dbSNP file - # delayeddf = delayed(pd.read_csv)(dbsnp_filepath,skiprows=getNumHeaderLines(dbsnp_filepath),sep='\t') - # dbsnp = dd.from_delayed(delayeddf) - tbx = pysam.TabixFile(dbsnp_filepath) - # print('Compiling list of known variants in the region from dbSNP151') - chromcol = [] - poscol = [] - idcol = [] - refcol = [] - altcol = [] - variantid = [] # in chr_pos_ref_alt_build format - rsids = dict( - {} - ) # a multi-allelic variant rsid (key) can be represented in several variantid formats (values) - for row in tbx.fetch(str(chrom), startbp, endbp): - rowlist = str(row).split("\t") - chromi = rowlist[0].replace("chr", "") - posi = rowlist[1] - idi = rowlist[2] - refi = rowlist[3] - alti = rowlist[4] - varstr = "_".join([chromi, posi, refi, alti, suffix]) - chromcol.append(chromi) - poscol.append(posi) - idcol.append(idi) - refcol.append(refi) - altcol.append(alti) - variantid.append(varstr) - rsids[idi] = [varstr] - altalleles = alti.split( - "," - ) # could have more than one alt allele (multi-allelic) - if len(altalleles) > 1: - varstr = "_".join([chromi, posi, refi, altalleles[0], suffix]) - rsids[idi].append(varstr) - for i in np.arange(len(altalleles) - 1): - varstr = "_".join([chromi, posi, refi, altalleles[i + 1], suffix]) - rsids[idi].append(varstr) - - # print('Cleaning and mapping list of variants') - variantlist = [ - asnp.split(";")[0].replace(":", "_").replace(".", "") for asnp in variantlist - ] # cleaning up the SNP names a bit - stdvariantlist = [] - for variant in variantlist: - if variant == "": - stdvariantlist.append(".") - continue - variantstr = variant.replace("chr", "") - if re.search("^23_", variantstr): - variantstr = variantstr.replace("23_", "X_", 1) - if variantstr.startswith("rs"): - try: - # Here's the difference from the first function version (we look at GTEx first) - if variant in list(variants_df[rsid_colname]): - stdvar = ( - variants_df["variant_id"] - .loc[variants_df[rsid_colname] == variant] - .to_list()[0] - ) - stdvariantlist.append(stdvar) - else: - stdvariantlist.append(rsids[variantstr][0]) - except: - stdvariantlist.append(".") - elif re.search( - r"^\d+_\d+_[A,T,G,C]+_[A,T,C,G]+,*", variantstr.replace("X", "23") - ): - strlist = variantstr.split("_") - strlist = list(filter(None, strlist)) # remove empty strings - try: - achr, astart, aend = parseRegionText( - strlist[0] + ":" + strlist[1] + "-" + str(int(strlist[1]) + 1), - build, - ) - achr = str(achr).replace("23", "X") - if achr == str(chrom) and astart >= startbp and astart <= endbp: - variantstr = ( - variantstr.replace("_" + str(suffix), "") + "_" + str(suffix) - ) - if len(variantstr.split("_")) == 5: - stdvariantlist.append(variantstr) - else: - raise InvalidUsage( - f"Variant format not recognizable: {variant}. Is it from another coordinate build system?", - status_code=410, - ) - else: - stdvariantlist.append(".") - except: - raise InvalidUsage(f"Problem with variant {variant}", status_code=410) - elif re.search(r"^\d+_\d+_*[A,T,G,C]*", variantstr.replace("X", "23")): - strlist = variantstr.split("_") - strlist = list(filter(None, strlist)) # remove empty strings - try: - achr, astart, aend = parseRegionText( - strlist[0] + ":" + strlist[1] + "-" + str(int(strlist[1]) + 1), - build, - ) - achr = str(achr).replace("23", "X") - if achr == str(chrom) and astart >= startbp and astart <= endbp: - if len(strlist) == 3: - aref = strlist[2] - else: - aref = "" - stdvariantlist.append(fetchSNV(achr, astart, aref, build)) - else: - stdvariantlist.append(".") - except: - raise InvalidUsage(f"Problem with variant {variant}", status_code=410) - else: - raise InvalidUsage( - f"Variant format not recognized: {variant}", status_code=410 - ) - return stdvariantlist - - -def cleanSNPs(variantlist, regiontext, build): - """ - Parameters - ---------- - variantlist : list - list of variant IDs in rs id or chr_pos, chr_pos_ref_alt, chr_pos_ref_alt_build, etc formats - regiontext : str - the region of interest in chr:start-end format - build : str - build.lower() in ['hg19','hg38', 'grch37', 'grch38'] must be true - - Returns - ------- - A cleaner set of SNP names - rs id's are cleaned to contain only one, - non-rs id formats are standardized to chr_pos_ref_alt_build format) - any SNPs not in regiontext are returned as '.' - """ - - variantlist = [ - asnp.split(";")[0].replace(":", "_").replace(".", "") for asnp in variantlist - ] # cleaning up the SNP names a bit - std_varlist = standardizeSNPs(variantlist, regiontext, build) - final_varlist = [ - e if (e.startswith("rs") and std_varlist[i] != ".") else std_varlist[i] - for i, e in enumerate(variantlist) - ] - - return final_varlist - - -def torsid(variantlist, regiontext, build): - """ - Parameters - ---------- - variantlist : list - List of variants in either rs id or other chr_pos, chr_pos_ref, chr_pos_ref_alt, chr_pos_ref_alt_build format. - - Returns - ------- - rsidlist : list - Corresponding rs id in the region if found. - Otherwise returns '.' - """ - - if all(x == "." for x in variantlist): - raise InvalidUsage("No variants provided") - - variantlist = cleanSNPs(variantlist, regiontext, build) - - chrom, startbp, endbp = parseRegionText(regiontext, build) - chrom = str(chrom).replace("23", "X") - - # Load dbSNP151 SNP names from region indicated - dbsnp_filepath = "" - suffix = "b37" - if build.lower() in ["hg38", "grch38"]: - suffix = "b38" - dbsnp_filepath = os.path.join( - app.config["LF_DATA_FOLDER"], "dbSNP151", "GRCh38p7", "All_20180418.vcf.gz" - ) - else: - suffix = "b37" - dbsnp_filepath = os.path.join( - app.config["LF_DATA_FOLDER"], "dbSNP151", "GRCh37p13", "All_20180423.vcf.gz" - ) - - # Load dbSNP file - tbx = pysam.TabixFile(dbsnp_filepath) - # print('Compiling list of known variants in the region from dbSNP151') - chromcol = [] - poscol = [] - idcol = [] - refcol = [] - altcol = [] - rsid = dict({}) # chr_pos_ref_alt_build (keys) for rsid output (values) - for row in tbx.fetch(str(chrom), startbp, endbp): - rowlist = str(row).split("\t") - chromi = rowlist[0].replace("chr", "") - posi = rowlist[1] - idi = rowlist[2] - refi = rowlist[3] - alti = rowlist[4] - varstr = "_".join([chromi, posi, refi, alti, suffix]) - chromcol.append(chromi) - poscol.append(posi) - idcol.append(idi) - refcol.append(refi) - altcol.append(alti) - rsid[varstr] = idi - altalleles = alti.split( - "," - ) # could have more than one alt allele (multi-allelic) - if len(altalleles) > 1: - varstr = "_".join([chromi, posi, refi, altalleles[0], suffix]) - rsid[varstr] = idi - for i in np.arange(len(altalleles) - 1): - varstr = "_".join([chromi, posi, refi, altalleles[i + 1], suffix]) - rsid[varstr] = idi - - finalvarlist = [] - for variant in variantlist: - if not variant.startswith("rs"): - try: - finalvarlist.append(rsid[variant]) - except: - finalvarlist.append(".") - else: - finalvarlist.append(variant) - - return finalvarlist - - -def decomposeVariant(variant_list): - """ - Parameters - ---------- - variantid_list : list - list of str standardized variants in chr_pos_ref_alt_build format - - Returns - ------- - A pandas.dataframe with chromosome, pos, reference and alternate alleles columns - """ - chromlist = [x.split("_")[0] if len(x.split("_")) == 5 else x for x in variant_list] - chromlist = [int(x) if x not in ["X", "."] else x for x in chromlist] - poslist = [ - int(x.split("_")[1]) if len(x.split("_")) == 5 else x for x in variant_list - ] - reflist = [x.split("_")[2] if len(x.split("_")) == 5 else x for x in variant_list] - altlist = [x.split("_")[3] if len(x.split("_")) == 5 else x for x in variant_list] - df = pd.DataFrame( - { - DEFAULT_FORM_VALUE_DICT[FormID.CHROM_COL]: chromlist, - DEFAULT_FORM_VALUE_DICT[FormID.POS_COL]: poslist, - DEFAULT_FORM_VALUE_DICT[FormID.REF_COL]: reflist, - DEFAULT_FORM_VALUE_DICT[FormID.ALT_COL]: altlist, - } - ) - return df - - def addVariantID(gwas_data, chromcol, poscol, refcol, altcol, build="hg19"): """ @@ -791,7 +362,7 @@ def addVariantID(gwas_data, chromcol, poscol, refcol, altcol, build="hg19"): def verifyStdSNPs(stdsnplist, regiontxt, build): # Ensure valid region: - chrom, startbp, endbp = parseRegionText(regiontxt, build) + chrom, startbp, endbp = parse_region_text(regiontxt, build) chrom = str(chrom).replace("23", "X") # Load GTEx variant lookup table for region indicated @@ -820,7 +391,7 @@ def subsetLocus(build, summaryStats, regiontext, chromcol, poscol, pcol): if regiontext == "": regiontext = DEFAULT_FORM_VALUE_DICT[FormID.LOCUS] # print('Parsing region text') - chrom, startbp, endbp = parseRegionText(regiontext, build) + chrom, startbp, endbp = parse_region_text(regiontext, build) summaryStats = summaryStats.loc[ [str(x) != "." for x in list(summaryStats[chromcol])] ].copy() @@ -1055,32 +626,6 @@ def subset_gwas_data_to_entered_columns( return gwas_data, column_dict, infer_variant -def standardize_gwas_variant_ids( - column_dict, gwas_data, regionstr: str, coordinate: str -): - # standardize variant id's: - variant_list = standardizeSNPs( - list(gwas_data[column_dict[FormID.SNP_COL]]), regionstr, coordinate - ) - if all(x == "." for x in variant_list): - raise InvalidUsage( - f"None of the variants provided could be mapped to {regionstr}!", - status_code=410, - ) - # get the chrom, pos, ref, alt info from the standardized variant_list - vardf = decomposeVariant(variant_list) - gwas_data = pd.concat([vardf, gwas_data], axis=1) - column_dict[FormID.CHROM_COL] = DEFAULT_FORM_VALUE_DICT[FormID.CHROM_COL] - column_dict[FormID.POS_COL] = DEFAULT_FORM_VALUE_DICT[FormID.POS_COL] - column_dict[FormID.REF_COL] = DEFAULT_FORM_VALUE_DICT[FormID.REF_COL] - column_dict[FormID.ALT_COL] = DEFAULT_FORM_VALUE_DICT[FormID.ALT_COL] - gwas_data = gwas_data.loc[ - [str(x) != "." for x in list(gwas_data[column_dict[FormID.CHROM_COL]])] - ].copy() - gwas_data.reset_index(drop=True, inplace=True) - return gwas_data, column_dict - - def clean_summary_datasets( summary_datasets, snp_column: str, chrom_column: str ) -> Tuple[List[pd.DataFrame], List[pd.Index]]: @@ -1485,7 +1030,7 @@ def get_multiple_regions(regionstext: str, build: str) -> List[Tuple[int, int, i regionstext = regionstext.splitlines() regions = [] for regiontext in regionstext: - chrom, start, end = parseRegionText(regiontext, build) + chrom, start, end = parse_region_text(regiontext, build) regions.append((chrom, start, end)) return regions @@ -1666,7 +1211,11 @@ def prev_session(): old_session_id = request.form["session-id"].strip() # Check celery session - if not app.config["DISABLE_CELERY"] and get_is_celery_running() and old_session_id != "example-output": + if ( + not app.config["DISABLE_CELERY"] + and get_is_celery_running() + and old_session_id != "example-output" + ): celery_result = AsyncResult(old_session_id, app=app.extensions["celery"]) if celery_result.status == "PENDING": raise InvalidUsage(f"Session {old_session_id} does not exist.") @@ -1725,7 +1274,11 @@ def prev_session(): def prev_session_input(old_session_id): # Check celery session - if not app.config["DISABLE_CELERY"] and get_is_celery_running() and old_session_id != "example-output": + if ( + not app.config["DISABLE_CELERY"] + and get_is_celery_running() + and old_session_id != "example-output" + ): celery_result = AsyncResult(old_session_id, app=app.extensions["celery"]) if celery_result.status == "PENDING": raise InvalidUsage(f"Session {old_session_id} does not exist.") @@ -1885,7 +1438,9 @@ def index(): file = request.files.get(name) if file: filepath = download_file(file) - if filepath is not None and any(str(filepath).endswith(ext) for ext in extensions): + if filepath is not None and any( + str(filepath).endswith(ext) for ext in extensions + ): filepaths.append(filepath) elif required: raise InvalidUsage(f"Missing required file: {description}", status_code=410) diff --git a/app/utils/__init__.py b/app/utils/__init__.py index 03df156..e5cb4b2 100644 --- a/app/utils/__init__.py +++ b/app/utils/__init__.py @@ -1,6 +1,7 @@ """ Common utility functions and classes shared by multiple routes in LocusFocus. """ + import os import re from typing import List, Optional @@ -13,8 +14,9 @@ from werkzeug.exceptions import RequestEntityTooLarge from werkzeug.datastructures import ImmutableMultiDict, FileStorage -from app import mongo +from app.utils.gtex import get_gtex_snps from app.utils.errors import InvalidUsage +from app.utils.helpers import parse_region_text GENOMIC_WINDOW_LIMIT = 2e6 @@ -59,9 +61,11 @@ def download_file(file: FileStorage, check_only: bool = False) -> Optional[os.Pa return filepath -def get_file_with_ext(filepaths: List[os.PathLike], extensions: List[str]) -> Optional[os.PathLike]: +def get_file_with_ext( + filepaths: List[os.PathLike], extensions: List[str] +) -> Optional[os.PathLike]: """ - Grab the first file in the list that matches the given extensions. This assumes that the + Grab the first file in the list that matches the given extensions. This assumes that the files are already uploaded and stored in the upload folder. Return None if no such file exists. @@ -94,11 +98,46 @@ def decompose_variant_list(variant_list): reflist = [x.split("_")[2] if len(x.split("_")) == 5 else x for x in variant_list] altlist = [x.split("_")[3] if len(x.split("_")) == 5 else x for x in variant_list] df = pd.DataFrame( - {"CHROM": chromlist, "POS": poslist, "REF": reflist, "ALT": altlist,} + { + "CHROM": chromlist, + "POS": poslist, + "REF": reflist, + "ALT": altlist, + } ) return df +def get_dbnsp_snps(chrom: str, startbp: int, endbp: int, build: str) -> List[str]: + """Fetch SNPs from dbSNP for a certain region and build. + + :return: The SNPs as a list of strings, the first five values are CHROM (no prefix), + POS, ID, REF, ALT + :rtype: List[str] + """ + + if build.lower() in ["hg38", "grch38"]: + dbsnp_filepath = os.path.join( + app.config["LF_DATA_FOLDER"], "dbSNP151", "GRCh38p7", "All_20180418.vcf.gz" + ) + else: + dbsnp_filepath = os.path.join( + app.config["LF_DATA_FOLDER"], "dbSNP151", "GRCh37p13", "All_20180423.vcf.gz" + ) + + # Load dbSNP file + tbx = pysam.TabixFile(dbsnp_filepath) # type: ignore + + result = [] + + for row in tbx.fetch(str(chrom), startbp, endbp): + rowlist = row.split("\t") + rowlist[0] = rowlist[0].replace("chr", "") + result.append(rowlist) + + return result + + def standardize_snps(variantlist, regiontxt, build): """ Input: Variant names in any of these formats: rsid, chrom_pos_ref_alt, chrom:pos_ref_alt, chrom:pos_ref_alt_b37/b38 @@ -118,69 +157,28 @@ def standardize_snps(variantlist, regiontxt, build): # Ensure valid region: chrom, startbp, endbp = parse_region_text(regiontxt, build) - chrom = str(chrom).replace("23", "X") - # Load GTEx variant lookup table for region indicated - db = mongo.cx.GTEx_V7 # type: ignore - rsid_colname = "rs_id_dbSNP147_GRCh37p13" - if build.lower() in ["hg38", "grch38"]: - db = mongo.cx.GTEx_V8 # type: ignore - rsid_colname = "rs_id_dbSNP151_GRCh38p7" - collection = db["variant_table"] - variants_query = collection.find( - { - "$and": [ - {"chr": int(chrom.replace("X", "23"))}, - {"variant_pos": {"$gte": int(startbp), "$lte": int(endbp)}}, - ] - } + variants_df = get_gtex_snps(chrom, startbp, endbp, build) + + rsid_colname = ( + "rs_id_dbSNP147_GRCh37p13" + if build.lower() in ["hg38", "grch38"] + else "rs_id_dbSNP151_GRCh38p7" ) - variants_list = list(variants_query) - variants_df = pd.DataFrame(variants_list) - variants_df = variants_df.drop(["_id"], axis=1) - # Load dbSNP151 SNP names from region indicated - dbsnp_filepath = "" - suffix = "b37" - if build.lower() in ["hg38", "grch38"]: - suffix = "b38" - dbsnp_filepath = os.path.join( - app.config["LF_DATA_FOLDER"], "dbSNP151", "GRCh38p7", "All_20180418.vcf.gz" - ) - else: - suffix = "b37" - dbsnp_filepath = os.path.join( - app.config["LF_DATA_FOLDER"], "dbSNP151", "GRCh37p13", "All_20180423.vcf.gz" - ) + suffix = "b37" if build.lower() in ["hg38", "grch38"] else "b37" + + chrom = str(chrom).replace("23", "X") + + dbsnp_snps = get_dbnsp_snps(chrom, startbp, endbp, build) - # Load dbSNP file - # delayeddf = delayed(pd.read_csv)(dbsnp_filepath,skiprows=getNumHeaderLines(dbsnp_filepath),sep='\t') - # dbsnp = dd.from_delayed(delayeddf) - tbx = pysam.TabixFile(dbsnp_filepath) # type: ignore - # print('Compiling list of known variants in the region from dbSNP151') - chromcol = [] - poscol = [] - idcol = [] - refcol = [] - altcol = [] - variantid = [] # in chr_pos_ref_alt_build format rsids = dict( {} ) # a multi-allelic variant rsid (key) can be represented in several variantid formats (values) - for row in tbx.fetch(str(chrom), startbp, endbp): - rowlist = str(row).split("\t") - chromi = rowlist[0].replace("chr", "") - posi = rowlist[1] - idi = rowlist[2] - refi = rowlist[3] - alti = rowlist[4] + for row in dbsnp_snps: + chromi, posi, idi, refi, alti = row[:5] varstr = "_".join([chromi, posi, refi, alti, suffix]) - chromcol.append(chromi) - poscol.append(posi) - idcol.append(idi) - refcol.append(refi) - altcol.append(alti) - variantid.append(varstr) + rsids[idi] = [varstr] altalleles = alti.split( "," @@ -192,7 +190,6 @@ def standardize_snps(variantlist, regiontxt, build): varstr = "_".join([chromi, posi, refi, altalleles[i + 1], suffix]) rsids[idi].append(varstr) - # print('Cleaning and mapping list of variants') variantlist = [ asnp.split(";")[0].replace(":", "_").replace(".", "") for asnp in variantlist ] # cleaning up the SNP names a bit @@ -201,9 +198,7 @@ def standardize_snps(variantlist, regiontxt, build): if variant == "": stdvariantlist.append(".") continue - variantstr = variant.replace("chr", "") - if re.search("^23_", variantstr): - variantstr = variantstr.replace("23_", "X_", 1) + variantstr = variant.replace("chr", "").replace("23_", "X_", 1) if variantstr.startswith("rs"): try: # Here's the difference from the first function version (we look at GTEx first) @@ -224,7 +219,7 @@ def standardize_snps(variantlist, regiontxt, build): strlist = variantstr.split("_") strlist = list(filter(None, strlist)) # remove empty strings try: - achr, astart, aend = parse_region_text( + achr, astart, _ = parse_region_text( strlist[0] + ":" + strlist[1] + "-" + str(int(strlist[1]) + 1), build, ) @@ -270,68 +265,6 @@ def standardize_snps(variantlist, regiontxt, build): return stdvariantlist -def parse_region_text(regiontext, build): - if build not in ["hg19", "hg38"]: - raise InvalidUsage(f"Unrecognized build: {build}", status_code=410) - regiontext = regiontext.strip().replace(" ", "").replace(",", "").replace("chr", "") - if not re.search( - r"^\d+:\d+-\d+$", regiontext.replace("X", "23").replace("x", "23") - ): - raise InvalidUsage( - f"Invalid coordinate format. '{regiontext}' e.g. 1:205,000,000-206,000,000", - status_code=410, - ) - chrom = regiontext.split(":")[0].lower().replace("chr", "").upper() - pos = regiontext.split(":")[1] - startbp = pos.split("-")[0].replace(",", "") - endbp = pos.split("-")[1].replace(",", "") - chromLengths = pd.read_csv( - os.path.join(app.config["LF_DATA_FOLDER"], build + "_chrom_lengths.txt"), - sep="\t", - encoding="utf-8", - ) - chromLengths.set_index("sequence", inplace=True) - if chrom in ["X", "x"] or chrom == "23": - chrom = 23 - maxChromLength = chromLengths.loc["chrX", "length"] - try: - startbp = int(startbp) - endbp = int(endbp) - except: - raise InvalidUsage( - f"Invalid coordinates input: '{regiontext}'", status_code=410 - ) - else: - try: - chrom = int(chrom) - if chrom == 23: - maxChromLength = chromLengths.loc["chrX", "length"] - else: - maxChromLength = chromLengths.loc["chr" + str(chrom), "length"] - startbp = int(startbp) - endbp = int(endbp) - except: - raise InvalidUsage( - f"Invalid coordinates input '{regiontext}'", status_code=410 - ) - if chrom < 1 or chrom > 23: - raise InvalidUsage("Chromosome input must be between 1 and 23", status_code=410) - elif startbp > endbp: - raise InvalidUsage( - "Starting chromosome basepair position is greater than ending basepair position", - status_code=410, - ) - elif startbp > maxChromLength or endbp > maxChromLength: - raise InvalidUsage("Start or end coordinates are out of range", status_code=410) - elif (endbp - startbp) > GENOMIC_WINDOW_LIMIT: - raise InvalidUsage( - f"Entered region size is larger than {GENOMIC_WINDOW_LIMIT/1e6} Mbp", - status_code=410, - ) - else: - return chrom, startbp, endbp - - def fetch_snv(chrom, bp, ref, build): variantid = "." @@ -343,32 +276,15 @@ def fetch_snv(chrom, bp, ref, build): regiontxt = str(chrom) + ":" + str(bp) + "-" + str(int(bp) + 1) except: raise InvalidUsage(f"Invalid input for {str(chrom):str(bp)}") - chrom, startbp, endbp = parse_region_text(regiontxt, build) + chrom = parse_region_text(regiontxt, build)[0] chrom = str(chrom).replace("chr", "").replace("23", "X") + suffix = "b37" if build.lower() in ["hg38", "grch38"] else "b37" - # Load dbSNP151 SNP names from region indicated - dbsnp_filepath = "" - if build.lower() in ["hg38", "grch38"]: - suffix = "b38" - dbsnp_filepath = os.path.join( - app.config["LF_DATA_FOLDER"], "dbSNP151", "GRCh38p7", "All_20180418.vcf.gz" - ) - else: - suffix = "b37" - dbsnp_filepath = os.path.join( - app.config["LF_DATA_FOLDER"], "dbSNP151", "GRCh37p13", "All_20180423.vcf.gz" - ) + dbsnps = get_dbnsp_snps(chrom, bp - 1, bp, build) - # Load variant info from dbSNP151 - tbx = pysam.TabixFile(dbsnp_filepath) # type: ignore varlist = [] - for row in tbx.fetch(str(chrom), bp - 1, bp): - rowlist = str(row).split("\t") - chromi = rowlist[0].replace("chr", "") - posi = rowlist[1] - idi = rowlist[2] - refi = rowlist[3] - alti = rowlist[4] + for row in dbsnps: + chromi, posi, _, refi, alti = row[:5] varstr = "_".join([chromi, posi, refi, alti, suffix]) varlist.append(varstr) diff --git a/app/utils/gencode.py b/app/utils/gencode.py index 7db69b5..d3a5c7f 100644 --- a/app/utils/gencode.py +++ b/app/utils/gencode.py @@ -6,7 +6,7 @@ from typing import List import pandas as pd -from app.utils import parse_region_text +from app.utils.helpers import parse_region_text # /app/utils/gencode.py -> /data/ DATA_FOLDER = os.path.abspath( @@ -26,7 +26,9 @@ ) -def get_genes_by_location(build: str, chrom: int, startbp: int, endbp: int, gencode=False) -> List[str]: +def get_genes_by_location( + build: str, chrom: int, startbp: int, endbp: int, gencode=False +) -> List[str]: """ Given a build and a region, return a list of genes that overlap that region. """ diff --git a/app/utils/gtex.py b/app/utils/gtex.py index 9ef3907..5f865db 100644 --- a/app/utils/gtex.py +++ b/app/utils/gtex.py @@ -1,11 +1,10 @@ - import pandas as pd from flask import current_app as app from pymongo import MongoClient from app import mongo -from app.utils import parse_region_text +from app.utils.helpers import parse_region_text from app.utils.gencode import collapsed_genes_df_hg19, collapsed_genes_df_hg38 from app.utils.errors import InvalidUsage @@ -15,6 +14,7 @@ with app.app_context(): client: MongoClient = mongo.cx # type: ignore + # This is the main function to extract the data for a tissue and gene_id: def get_gtex(version, tissue, gene_id): if version.upper() == "V8": @@ -168,15 +168,9 @@ def get_gtex_data_pvalues(eqtl_data, snp_list): return list(gtex_data["pval"]) -def get_gtex_snp_matches(stdsnplist, regiontxt, build): - """ - Return the number of SNPs that can be found in the GTEx database for the given region. - """ - # Ensure valid region: - chrom, startbp, endbp = parse_region_text(regiontxt, build) - chrom = str(chrom).replace("23", "X") +def get_gtex_snps(chrom: int, startbp: int, endbp: int, build: str) -> pd.DataFrame: + """Fetch SNPs from gtex database for a given chrom, range, and build""" - # Load GTEx variant lookup table for region indicated db = client.GTEx_V7 if build.lower() in ["hg38", "grch38"]: db = client.GTEx_V8 @@ -184,14 +178,25 @@ def get_gtex_snp_matches(stdsnplist, regiontxt, build): variants_query = collection.find( { "$and": [ - {"chr": int(chrom.replace("X", "23"))}, + {"chr": chrom}, {"variant_pos": {"$gte": int(startbp), "$lte": int(endbp)}}, ] } ) variants_list = list(variants_query) - variants_df = pd.DataFrame(variants_list) - variants_df = variants_df.drop(["_id"], axis=1) + return pd.DataFrame(variants_list).drop(["_id"], axis=1) + + +def get_gtex_snp_matches(stdsnplist, regiontxt, build): + """ + Return the number of SNPs that can be found in the GTEx database for the given region. + """ + # Ensure valid region: + chrom, startbp, endbp = parse_region_text(regiontxt, build) + + # Load GTEx variant lookup table for region indicated + variants_df = get_gtex_snps(chrom, startbp, endbp, build) + gtex_std_snplist = list(variants_df["variant_id"]) isInGTEx = [x for x in stdsnplist if x in gtex_std_snplist] return len(isInGTEx) diff --git a/app/utils/helpers.py b/app/utils/helpers.py index f9b7d7b..d0191db 100644 --- a/app/utils/helpers.py +++ b/app/utils/helpers.py @@ -1,7 +1,14 @@ +import os +import re from typing import List +from flask import current_app as app import pandas as pd +from app.utils.errors import InvalidUsage + +GENOMIC_WINDOW_LIMIT = 2e6 + def validate_chromosome( chr: str | int, prefix: str | None = "chr", x_y_numeric: bool = False @@ -35,15 +42,15 @@ def validate_chromosome( def adjust_snp_column( - snps_df: pd.DataFrame, - target_build: str, - snp_col: str = "SNP", - chrom_col: str = "CHROM", - pos_col: str = "POS", - ref_col: str = "REF", - alt_col: str = "ALT", - ignore_alleles: bool = False, - ) -> pd.DataFrame: + snps_df: pd.DataFrame, + target_build: str, + snp_col: str = "SNP", + chrom_col: str = "CHROM", + pos_col: str = "POS", + ref_col: str = "REF", + alt_col: str = "ALT", + ignore_alleles: bool = False, +) -> pd.DataFrame: """ Adjust the SNP column to be consistent with the other columns in the dataframe. rsIDs are ignored. Necessary to perform after LiftOver. @@ -62,8 +69,8 @@ def adjust_snp_column( :type ref_col: str, optional :param alt_col: The name of the alternate allele column, defaults to "ALT" :type alt_col: str, optional - :param ignore_alleles: Whether alleles should be ignored. - If true, alleles will be pulled from the SNP column instead of the ref/alt columns. + :param ignore_alleles: Whether alleles should be ignored. + If true, alleles will be pulled from the SNP column instead of the ref/alt columns. Defaults to False :type ignore_alleles: bool, optional :return: The dataframe with the SNP column adjusted. @@ -77,15 +84,97 @@ def adjust_snp_column( # Mask out rows with rsIDs in the SNP column rsid_mask = snps_df[snp_col].str.contains("rs") if rsid_mask.all(): - # nothing to do, rsIDs are semi-stable + # nothing to do, rsIDs are semi-stable return snps_df if not ignore_alleles: - snps_df.loc[~rsid_mask, snp_col] = snps_df.loc[~rsid_mask, chrom_col].astype(str) + "_" + snps_df.loc[~rsid_mask, pos_col].astype(str) + "_" + snps_df.loc[~rsid_mask, ref_col].astype(str) + "_" + snps_df.loc[~rsid_mask, alt_col].astype(str) + "_" + build_suffix + snps_df.loc[~rsid_mask, snp_col] = ( + snps_df.loc[~rsid_mask, chrom_col].astype(str) + + "_" + + snps_df.loc[~rsid_mask, pos_col].astype(str) + + "_" + + snps_df.loc[~rsid_mask, ref_col].astype(str) + + "_" + + snps_df.loc[~rsid_mask, alt_col].astype(str) + + "_" + + build_suffix + ) else: # get alleles from snp column alleles = snps_df[snp_col].str.split("_", expand=True) alleles.columns = ["chrom", "pos", "ref", "alt", "build"] - snps_df.loc[~rsid_mask, snp_col] = snps_df.loc[~rsid_mask, chrom_col].astype(str) + "_" + snps_df.loc[~rsid_mask, pos_col].astype(str) + "_" + alleles.loc[~rsid_mask, "ref"].astype(str) + "_" + alleles.loc[~rsid_mask, "alt"].astype(str) + "_" + build_suffix + snps_df.loc[~rsid_mask, snp_col] = ( + snps_df.loc[~rsid_mask, chrom_col].astype(str) + + "_" + + snps_df.loc[~rsid_mask, pos_col].astype(str) + + "_" + + alleles.loc[~rsid_mask, "ref"].astype(str) + + "_" + + alleles.loc[~rsid_mask, "alt"].astype(str) + + "_" + + build_suffix + ) return snps_df + + +def parse_region_text(regiontext, build): + if build not in ["hg19", "hg38"]: + raise InvalidUsage(f"Unrecognized build: {build}", status_code=410) + regiontext = regiontext.strip().replace(" ", "").replace(",", "").replace("chr", "") + if not re.search( + r"^\d+:\d+-\d+$", regiontext.replace("X", "23").replace("x", "23") + ): + raise InvalidUsage( + f"Invalid coordinate format. '{regiontext}' e.g. 1:205,000,000-206,000,000", + status_code=410, + ) + chrom = regiontext.split(":")[0].lower().replace("chr", "").upper() + pos = regiontext.split(":")[1] + startbp = pos.split("-")[0].replace(",", "") + endbp = pos.split("-")[1].replace(",", "") + chromLengths = pd.read_csv( + os.path.join(app.config["LF_DATA_FOLDER"], build + "_chrom_lengths.txt"), + sep="\t", + encoding="utf-8", + ) + chromLengths.set_index("sequence", inplace=True) + if chrom in ["X", "x"] or chrom == "23": + chrom = 23 + maxChromLength = chromLengths.loc["chrX", "length"] + try: + startbp = int(startbp) + endbp = int(endbp) + except: + raise InvalidUsage( + f"Invalid coordinates input: '{regiontext}'", status_code=410 + ) + else: + try: + chrom = int(chrom) + if chrom == 23: + maxChromLength = chromLengths.loc["chrX", "length"] + else: + maxChromLength = chromLengths.loc["chr" + str(chrom), "length"] + startbp = int(startbp) + endbp = int(endbp) + except: + raise InvalidUsage( + f"Invalid coordinates input '{regiontext}'", status_code=410 + ) + if chrom < 1 or chrom > 23: + raise InvalidUsage("Chromosome input must be between 1 and 23", status_code=410) + elif startbp > endbp: + raise InvalidUsage( + "Starting chromosome basepair position is greater than ending basepair position", + status_code=410, + ) + elif startbp > maxChromLength or endbp > maxChromLength: + raise InvalidUsage("Start or end coordinates are out of range", status_code=410) + elif (endbp - startbp) > GENOMIC_WINDOW_LIMIT: + raise InvalidUsage( + f"Entered region size is larger than {GENOMIC_WINDOW_LIMIT/1e6} Mbp", + status_code=410, + ) + else: + return chrom, startbp, endbp From 1bfb0eaa56118f94bd6b32b31218bdc7cfa1d55a Mon Sep 17 00:00:00 2001 From: cklamann <12862284+cklamann@users.noreply.github.com> Date: Mon, 17 Nov 2025 16:59:01 -0500 Subject: [PATCH 4/5] ruff --- app/utils/helpers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/app/utils/helpers.py b/app/utils/helpers.py index d0191db..6da48e3 100644 --- a/app/utils/helpers.py +++ b/app/utils/helpers.py @@ -145,7 +145,7 @@ def parse_region_text(regiontext, build): try: startbp = int(startbp) endbp = int(endbp) - except: + except Exception: raise InvalidUsage( f"Invalid coordinates input: '{regiontext}'", status_code=410 ) @@ -158,7 +158,7 @@ def parse_region_text(regiontext, build): maxChromLength = chromLengths.loc["chr" + str(chrom), "length"] startbp = int(startbp) endbp = int(endbp) - except: + except Exception: raise InvalidUsage( f"Invalid coordinates input '{regiontext}'", status_code=410 ) From 37db56e66882692a36c06f566fb70f17fea41b64 Mon Sep 17 00:00:00 2001 From: cklamann <12862284+cklamann@users.noreply.github.com> Date: Mon, 17 Nov 2025 17:02:59 -0500 Subject: [PATCH 5/5] cleanup --- app/utils/__init__.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/app/utils/__init__.py b/app/utils/__init__.py index f7756ef..35438c1 100644 --- a/app/utils/__init__.py +++ b/app/utils/__init__.py @@ -203,8 +203,6 @@ def standardize_snps(variantlist, regiontxt, build): if variant == "": stdvariantlist.append(".") continue - # this seems dangerous, what if format is chr1_123_A_T? - # then you would have chr1_1X_A_T variantstr = variant.replace("chr23_", "chrX_").replace("chr", "") if variantstr.startswith("rs"): try: