diff --git a/.gitignore b/.gitignore index 58d6e34..5ac6900 100644 --- a/.gitignore +++ b/.gitignore @@ -141,6 +141,7 @@ data/gwas.public.txt.gz data/uk_biobank/ data/mQTL_eQTM_SNPs_SAKNORM_For_CoLoc*.txt data/Note.txt +misc/**/test* data/slc9a3_gwas_mqtl.html data/*lookup_table.txt.gz .Rproj.user diff --git a/app/__init__.py b/app/__init__.py index d31ca78..3b57336 100644 --- a/app/__init__.py +++ b/app/__init__.py @@ -9,6 +9,7 @@ from flask_sitemap import Sitemap from flask_talisman import Talisman +from app.commands import register_cli from app.config import BaseConfig, DevConfig, ProdConfig from app.cache import cache @@ -66,6 +67,7 @@ def create_app(config_class=ConfigClass): mongo.init_app(app) celery_init_app(app) cache.init_app(app) + register_cli(app) if app.config["CACHE_TYPE"] == "NullCache": app.logger.debug("Cache is disabled") diff --git a/app/commands.py b/app/commands.py new file mode 100644 index 0000000..1b4330b --- /dev/null +++ b/app/commands.py @@ -0,0 +1,82 @@ +from os import path +from pathlib import Path +import shutil +import tarfile +import zipfile + +import click +from flask import Flask, current_app +import requests +import tqdm + + +def register_cli(app: Flask): + @app.cli.command(name="download-smr-mqtl", help="Download mQTL files") + @click.option( + "--lite", + is_flag=True, + help="Download 'lite' version of the McRae et al. mQTL data (only SNPs with P < 1e-5 are included; 241 MB)", + ) + def download_smr_mqtl(lite): + # https://yanglab.westlake.edu.cn/software/smr/#mQTLsummarydata + file_list = [ + # Whole blood mQTL data set used in Hannon et al. (2018 AJHG).(121MB) + # Saved as US_mQTLS_SMR_format + "https://yanglab.westlake.edu.cn/data/SMR/US_mQTLS_SMR_format.zip", + # 42MB + "https://yanglab.westlake.edu.cn/data/SMR/Hannon_Blood_dataset1.zip", + # 25MB + "https://yanglab.westlake.edu.cn/data/SMR/Hannon_Blood_dataset2.zip", + # https://yanglab.westlake.edu.cn/data/SMR/Hannon_FetalBrain.zip (4.8MB) + "https://yanglab.westlake.edu.cn/data/SMR/Hannon_FetalBrain.zip", + # mQTL summary data from a meta-analysis of samples of East Asian ancestry. (2.5GB) + # no particular tissue? saved as EAS + "https://yanglab.westlake.edu.cn/data/SMR/EAS.tar.gz", + # mQTL summary data from a meta-analysis of samples of European ancestry. (3.7GB) + # no particular tissue? saved as EUR + "https://yanglab.westlake.edu.cn/data/SMR/EUR.tar.gz", + # Brain-mMeta mQTL summary data (Qi et al. 2018 Nat Commun) in SMR binary (BESD) format: Brain-mMeta.tar.gz (893 MB) + # brain (from meta-analysis) + "https://yanglab.westlake.edu.cn/data/SMR/Brain-mMeta.tar.gz", + ] + ( + # Lite version of the McRae et al. mQTL data (only SNPs with P < 1e-5 are included; 241 MB) + # peripheral blood + ["https://yanglab.westlake.edu.cn/data/SMR/LBC_BSGS_meta_lite.tar.gz"] + if lite + else [ + # McRae et al. mQTL summary data (7.5 GB) + "https://yanglab.westlake.edu.cn/data/SMR/LBC_BSGS_meta.tar.gz", + ] + ) + + data_dir = path.join(path.dirname(path.dirname(__file__)), "data", "smr_mqtl") + Path(data_dir).mkdir(exist_ok=True, parents=True) + + for file_url in file_list: + filename = path.basename(file_url) + _, ext = path.splitext(filename) + tmp_save_path = path.join(data_dir, filename) + with requests.get(file_url, stream=True) as r: + current_app.logger.info(f"Downloading {filename}...") + if r.status_code != 200: + r.raise_for_status() + raise RuntimeError( + f"Request to {file_url} returned status code {r.status_code}" + ) + file_size = int(r.headers.get("Content-Length", 0)) + desc = "(Unknown total file size)" if file_size == 0 else "" + with tqdm.tqdm.wrapattr( + r.raw, "read", total=file_size, desc=desc + ) as r_raw: + with open(tmp_save_path, "wb") as fd: + shutil.copyfileobj(r_raw, fd) + + if ext == ".zip": + with zipfile.ZipFile(tmp_save_path) as zf: + zf.extractall(data_dir) + Path(tmp_save_path).unlink() + elif ext == ".gz": + with tarfile.open(tmp_save_path) as tf: + tf.extractall(data_dir, filter="data") + Path(tmp_save_path).unlink() + tf.close() diff --git a/app/utils/smr.py b/app/utils/smr.py new file mode 100644 index 0000000..ab6395b --- /dev/null +++ b/app/utils/smr.py @@ -0,0 +1,176 @@ +import os +import re +import subprocess +from tempfile import NamedTemporaryFile +from typing import List, Literal, TypedDict + +import pandas as pd + +curr_dir = os.path.dirname(__file__) +data_dir = os.path.join(os.path.dirname(os.path.dirname(curr_dir)), "data", "smr_mqtl") + + +class SMRDataset(TypedDict): + assembly: Literal["hg19", "hg38"] + base_filename: str + by_chr: bool + + +smr_datasets: dict[str, SMRDataset] = { + "Brain-mMeta": { + "assembly": "hg38", + "by_chr": False, + "base_filename": "Brain-mMeta", + }, + "EAS": { + "assembly": "hg38", + "by_chr": True, + "base_filename": "EAS", + }, + "EUR": { + "assembly": "hg38", + "by_chr": True, + "base_filename": "EUR", + }, + "Hannon et al. Blood dataset1": { + "assembly": "hg19", + "by_chr": False, + "base_filename": "Aberdeen_Blood", + }, + "Hannon et al. Blood dataset2": { + "assembly": "hg19", + "by_chr": False, + "base_filename": "UCL_Blood", + }, + "Hannon et al. FetalBrain": { + "assembly": "hg19", + "by_chr": False, + "base_filename": "FB_Brain", + }, + "LBC_BSGS_meta": { + "assembly": "hg19", + "by_chr": True, + "base_filename": "bl_mqtl", + }, + "LBC_BSGS_meta_lite": { + "assembly": "hg19", + "by_chr": True, + "base_filename": "bl_mqtl_lite", + }, + "US_mQTLS_SMR_format": { + "assembly": "hg19", + "by_chr": False, + "base_filename": "US_Blood", + }, +} + + +def run_smr_query( + query_path: str, chr: int, thresh: float, start: int, end: int +) -> pd.DataFrame: + """Query the data, save to a temporary file, and return as a dataframe + + :param query_path: The path to the data to be queried + :type query_path: str + :param chr: The chromosome to query + :type chr: int + :param thresh: The p-value threshold + :type thresh: float + :param start: The start bp to query + :type start: int + :param end: Then end bp to query + :type end: int + :return: The returned SNPs and pvalues + :rtype: pd.DataFrame + """ + with NamedTemporaryFile("w") as f: + query = [ + "smr", + "--beqtl-summary", + query_path, + "--query", + str(thresh), + "--snp-chr", + str(chr), + "--from-snp-kb", + str(start), + "--to-snp-kb", + str(end), + "--out", + f.name, + ] + + subprocess.run(query, check=True) + + return pd.read_csv(f"{f.name}.txt", sep="\t") + + +def query_smr( + chr: int, + snps: List[str], + dataset: str, + thresh: float = 5.0e-8, + assembly: Literal["hg19", "hg38"] = "hg38", +) -> pd.DataFrame: + """Query mqtl data in smr format + + :param chr: The chromosome to query + :type chr: int + :param snps: A list of SNPS in format chr{chr}_{bp}_ref_alt + :type snps: List[str] + :param dataset: The dataset to query + :type dataset: str + :param thresh: The p-value threshold, defaults to 5e-8 + :type thresh: float + :param assembly: The genome assembly to use, defaults to "hg38" + :type assembly: Literal["hg19", "hg38"] + :raises FileNotFoundError: If the dataset does not exist + :raises ValueError: If the requested assembly does not match the dataset assembly + :return: The SNPs and pvalues as a dataframe with the following columns: + 'SNP', 'Chr', 'BP', 'A1', 'A2', 'Freq', 'Probe', 'Probe_Chr', + 'Probe_bp', 'Gene', 'Orientation', 'b', 'SE', 'p', 'full_snp' + Note that 'full_snp' is a combined column that takes the same format as those in ``snps`` + :rtype: pd.DataFrame + """ + if dataset not in smr_datasets.keys(): + raise FileNotFoundError(f"Dataset {dataset} does not exist!") + + if smr_datasets[dataset]["assembly"] != assembly: + raise ValueError( + f"Dataset {dataset} uses {smr_datasets[dataset]['assembly']} but {assembly} was requested!" + ) + + dataset_dir = os.path.join(data_dir, dataset) + base_filepath = os.path.join(dataset_dir, dataset) + if smr_datasets[dataset]["by_chr"]: + base_filepath = f"{base_filepath}_chr{chr}" + + regex = r"_(\d+)_" + + snp_poses = [int(re.findall(regex, snp)[0]) for snp in snps] + + start = min(snp_poses) // 1000 + end = max(snp_poses) // 1000 + 1 + + query_result = run_smr_query( + query_path=base_filepath, + chr=chr, + thresh=thresh, + start=start, + end=end, + ) + + query_result["full_snp"] = query_result.apply( + lambda df: f"chr{str(df['Chr'])}" + + "_" + + str(df["BP"]) + + "_" + + df["A1"] + + "_" + + df["A2"], + axis=1, + ) + + filtered = query_result[query_result["full_snp"].isin(snps)] + + return filtered diff --git a/setup/setup-smr.sh b/setup/setup-smr.sh new file mode 100644 index 0000000..656fd2f --- /dev/null +++ b/setup/setup-smr.sh @@ -0,0 +1,7 @@ +#!/bin/bash +set -e + +wget https://yanglab.westlake.edu.cn/software/smr/download/smr-1.3.2-linux-x86_64.zip +unzip smr-1.3.2-linux-x86_64.zip +mv smr-1.3.2-linux-x86_64/smr misc/ +rm smr-1.3.2-linux-x86_64.zip diff --git a/tests/utils/test_query_smr.py b/tests/utils/test_query_smr.py new file mode 100644 index 0000000..34a113a --- /dev/null +++ b/tests/utils/test_query_smr.py @@ -0,0 +1,99 @@ +from unittest.mock import patch, Mock + +import pandas as pd + +from app.utils.smr import query_smr + +mock_result = pd.DataFrame( + [ + [ + "chr1:982513", + 1, + 982513, + "T", + "C", + 0.074442, + "cg24669183", + 1, + 534242, + pd.NA, + "N", + 0.127188, + 0.058496, + 0.029684, + ], + [ + "chr1:982513", + 1, + 982513, + "T", + "C", + 0.074442, + "cg12726839", + 1, + 845311, + pd.NA, + "N", + 0.180720, + 0.058765, + 0.002103, + ], + [ + "chr1:982513", + 1, + 982513, + "A", + "T", + 0.074442, + "cg12726839", + 1, + 845311, + pd.NA, + "N", + 0.180720, + 0.058765, + 0.002103, + ], + ], + columns=[ + "SNP", + "Chr", + "BP", + "A1", + "A2", + "Freq", + "Probe", + "Probe_Chr", + "Probe_bp", + "Gene", + "Orientation", + "b", + "SE", + "p", + ], +) + + +@patch("app.utils.smr.run_smr_query", return_value=mock_result) +def test_query_smr(mock: Mock): + """Test the query function with mock data, since smr files are not committed to source, + ensuring that filtering and query construction functions are correst. + """ + chr = 1 + snps = ["chr1_982513_T_C"] + dataset = "EUR" + thresh = 1 + + res = query_smr(chr, snps, dataset, thresh, "hg38") + + assert len(res) == 2 + + assert len(res["full_snp"].isin(snps)) == 2 + + mock.assert_called_once_with( + query_path="/code/data/smr_mqtl/EUR/EUR_chr1", + thresh=1, + chr=1, + start=982, + end=983, + )