strug-hub · cklamann · Jun 23, 2025 · Jul 15, 2025 · Jul 21, 2025 · Jul 21, 2025
diff --git a/.gitignore b/.gitignore
@@ -141,6 +141,7 @@ data/gwas.public.txt.gz
 data/uk_biobank/
 data/mQTL_eQTM_SNPs_SAKNORM_For_CoLoc*.txt
 data/Note.txt
+misc/**/test*
 data/slc9a3_gwas_mqtl.html
 data/*lookup_table.txt.gz
 .Rproj.user

diff --git a/app/__init__.py b/app/__init__.py
@@ -9,6 +9,7 @@
 from flask_sitemap import Sitemap
 from flask_talisman import Talisman
 
+from app.commands import register_cli
 from app.config import BaseConfig, DevConfig, ProdConfig
 from app.cache import cache
 
@@ -66,6 +67,7 @@ def create_app(config_class=ConfigClass):
     mongo.init_app(app)
     celery_init_app(app)
     cache.init_app(app)
+    register_cli(app)
 
     if app.config["CACHE_TYPE"] == "NullCache":
         app.logger.debug("Cache is disabled")

diff --git a/app/commands.py b/app/commands.py
@@ -0,0 +1,82 @@
+from os import path
+from pathlib import Path
+import shutil
+import tarfile
+import zipfile
+
+import click
+from flask import Flask, current_app
+import requests
+import tqdm
+
+
+def register_cli(app: Flask):
+    @app.cli.command(name="download-smr-mqtl", help="Download mQTL files")
+    @click.option(
+        "--lite",
+        is_flag=True,
+        help="Download 'lite' version of the McRae et al. mQTL data (only SNPs with P < 1e-5 are included; 241 MB)",
+    )
+    def download_smr_mqtl(lite):
+        # https://yanglab.westlake.edu.cn/software/smr/#mQTLsummarydata
+        file_list = [
+            # Whole blood mQTL data set used in Hannon et al. (2018 AJHG).(121MB)
+            # Saved as US_mQTLS_SMR_format
+            "https://yanglab.westlake.edu.cn/data/SMR/US_mQTLS_SMR_format.zip",
+            # 42MB
+            "https://yanglab.westlake.edu.cn/data/SMR/Hannon_Blood_dataset1.zip",
+            # 25MB
+            "https://yanglab.westlake.edu.cn/data/SMR/Hannon_Blood_dataset2.zip",
+            # https://yanglab.westlake.edu.cn/data/SMR/Hannon_FetalBrain.zip (4.8MB)
+            "https://yanglab.westlake.edu.cn/data/SMR/Hannon_FetalBrain.zip",
+            # mQTL summary data from a meta-analysis of samples of East Asian ancestry. (2.5GB)
+            # no particular tissue? saved as EAS
+            "https://yanglab.westlake.edu.cn/data/SMR/EAS.tar.gz",
+            # mQTL summary data from a meta-analysis of samples of European ancestry. (3.7GB)
+            # no particular tissue? saved as EUR
+            "https://yanglab.westlake.edu.cn/data/SMR/EUR.tar.gz",
+            # Brain-mMeta mQTL summary data (Qi et al. 2018 Nat Commun) in SMR binary (BESD) format: Brain-mMeta.tar.gz (893 MB)
+            # brain (from meta-analysis)
+            "https://yanglab.westlake.edu.cn/data/SMR/Brain-mMeta.tar.gz",
+        ] + (
+            # Lite version of the McRae et al. mQTL data (only SNPs with P < 1e-5 are included; 241 MB)
+            # peripheral blood
+            ["https://yanglab.westlake.edu.cn/data/SMR/LBC_BSGS_meta_lite.tar.gz"]
+            if lite
+            else [
+                # McRae et al. mQTL summary data (7.5 GB)
+                "https://yanglab.westlake.edu.cn/data/SMR/LBC_BSGS_meta.tar.gz",
+            ]
+        )
+
+        data_dir = path.join(path.dirname(path.dirname(__file__)), "data", "smr_mqtl")
+        Path(data_dir).mkdir(exist_ok=True, parents=True)
+
+        for file_url in file_list:
+            filename = path.basename(file_url)
+            _, ext = path.splitext(filename)
+            tmp_save_path = path.join(data_dir, filename)
+            with requests.get(file_url, stream=True) as r:
+                current_app.logger.info(f"Downloading {filename}...")
+                if r.status_code != 200:
+                    r.raise_for_status()
+                    raise RuntimeError(
+                        f"Request to {file_url} returned status code {r.status_code}"
+                    )
+                file_size = int(r.headers.get("Content-Length", 0))
+                desc = "(Unknown total file size)" if file_size == 0 else ""
+                with tqdm.tqdm.wrapattr(
+                    r.raw, "read", total=file_size, desc=desc
+                ) as r_raw:
+                    with open(tmp_save_path, "wb") as fd:
+                        shutil.copyfileobj(r_raw, fd)
+
+            if ext == ".zip":
+                with zipfile.ZipFile(tmp_save_path) as zf:
+                    zf.extractall(data_dir)
+                    Path(tmp_save_path).unlink()
+            elif ext == ".gz":
+                with tarfile.open(tmp_save_path) as tf:
+                    tf.extractall(data_dir, filter="data")
+                    Path(tmp_save_path).unlink()
+                    tf.close()
diff --git a/app/utils/smr.py b/app/utils/smr.py
@@ -0,0 +1,176 @@
+import os
+import re
+import subprocess
+from tempfile import NamedTemporaryFile
+from typing import List, Literal, TypedDict
+
+import pandas as pd
+
+curr_dir = os.path.dirname(__file__)
+data_dir = os.path.join(os.path.dirname(os.path.dirname(curr_dir)), "data", "smr_mqtl")
+
+
+class SMRDataset(TypedDict):
+    assembly: Literal["hg19", "hg38"]
+    base_filename: str
+    by_chr: bool
+
+
+smr_datasets: dict[str, SMRDataset] = {
+    "Brain-mMeta": {
+        "assembly": "hg38",
+        "by_chr": False,
+        "base_filename": "Brain-mMeta",
+    },
+    "EAS": {
+        "assembly": "hg38",
+        "by_chr": True,
+        "base_filename": "EAS",
+    },
+    "EUR": {
+        "assembly": "hg38",
+        "by_chr": True,
+        "base_filename": "EUR",
+    },
+    "Hannon et al. Blood dataset1": {
+        "assembly": "hg19",
+        "by_chr": False,
+        "base_filename": "Aberdeen_Blood",
+    },
+    "Hannon et al. Blood dataset2": {
+        "assembly": "hg19",
+        "by_chr": False,
+        "base_filename": "UCL_Blood",
+    },
+    "Hannon et al. FetalBrain": {
+        "assembly": "hg19",
+        "by_chr": False,
+        "base_filename": "FB_Brain",
+    },
+    "LBC_BSGS_meta": {
+        "assembly": "hg19",
+        "by_chr": True,
+        "base_filename": "bl_mqtl",
+    },
+    "LBC_BSGS_meta_lite": {
+        "assembly": "hg19",
+        "by_chr": True,
+        "base_filename": "bl_mqtl_lite",
+    },
+    "US_mQTLS_SMR_format": {
+        "assembly": "hg19",
+        "by_chr": False,
+        "base_filename": "US_Blood",
+    },
+}
+
+
+def run_smr_query(
+    query_path: str, chr: int, thresh: float, start: int, end: int
+) -> pd.DataFrame:
+    """Query the data, save to a temporary file, and return as a dataframe
+
+    :param query_path: The path to the data to be queried
+    :type query_path: str
+    :param chr: The chromosome to query
+    :type chr: int
+    :param thresh: The p-value threshold
+    :type thresh: float
+    :param start: The start bp to query
+    :type start: int
+    :param end: Then end bp to query
+    :type end: int
+    :return: The returned SNPs and pvalues
+    :rtype: pd.DataFrame
+    """
+    with NamedTemporaryFile("w") as f:
+        query = [
+            "smr",
+            "--beqtl-summary",
+            query_path,
+            "--query",
+            str(thresh),
+            "--snp-chr",
+            str(chr),
+            "--from-snp-kb",
+            str(start),
+            "--to-snp-kb",
+            str(end),
+            "--out",
+            f.name,
+        ]
+
+        subprocess.run(query, check=True)
+
+        return pd.read_csv(f"{f.name}.txt", sep="\t")
+
+
+def query_smr(
+    chr: int,
+    snps: List[str],
+    dataset: str,
+    thresh: float = 5.0e-8,
+    assembly: Literal["hg19", "hg38"] = "hg38",
+) -> pd.DataFrame:
+    """Query mqtl data in smr format
+
+    :param chr: The chromosome to query
+    :type chr: int
+    :param snps: A list of SNPS in format chr{chr}_{bp}_ref_alt
+    :type snps: List[str]
+    :param dataset: The dataset to query
+    :type dataset: str
+    :param thresh: The p-value threshold, defaults to 5e-8
+    :type thresh: float
+    :param assembly: The genome assembly to use, defaults to "hg38"
+    :type assembly: Literal["hg19", "hg38"]
+    :raises FileNotFoundError: If the dataset does not exist
+    :raises ValueError: If the requested assembly does not match the dataset assembly
+    :return: The SNPs and pvalues as a dataframe with the following columns:
+    'SNP', 'Chr', 'BP', 'A1', 'A2', 'Freq', 'Probe', 'Probe_Chr',
+    'Probe_bp', 'Gene', 'Orientation', 'b', 'SE', 'p', 'full_snp'
+    Note that 'full_snp' is a combined column that takes the same format as those in ``snps``
+    :rtype: pd.DataFrame
+    """
+    if dataset not in smr_datasets.keys():
+        raise FileNotFoundError(f"Dataset {dataset} does not exist!")
+
+    if smr_datasets[dataset]["assembly"] != assembly:
+        raise ValueError(
+            f"Dataset {dataset} uses {smr_datasets[dataset]['assembly']} but {assembly} was requested!"
+        )
+
+    dataset_dir = os.path.join(data_dir, dataset)
+    base_filepath = os.path.join(dataset_dir, dataset)
+    if smr_datasets[dataset]["by_chr"]:
+        base_filepath = f"{base_filepath}_chr{chr}"
+
+    regex = r"_(\d+)_"
+
+    snp_poses = [int(re.findall(regex, snp)[0]) for snp in snps]
+
+    start = min(snp_poses) // 1000
+    end = max(snp_poses) // 1000 + 1
+
+    query_result = run_smr_query(
+        query_path=base_filepath,
+        chr=chr,
+        thresh=thresh,
+        start=start,
+        end=end,
+    )
+
+    query_result["full_snp"] = query_result.apply(
+        lambda df: f"chr{str(df['Chr'])}"
+        + "_"
+        + str(df["BP"])
+        + "_"
+        + df["A1"]
+        + "_"
+        + df["A2"],
+        axis=1,
+    )
+
+    filtered = query_result[query_result["full_snp"].isin(snps)]
+
+    return filtered
diff --git a/setup/setup-smr.sh b/setup/setup-smr.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+set -e
+
+wget https://yanglab.westlake.edu.cn/software/smr/download/smr-1.3.2-linux-x86_64.zip
+unzip smr-1.3.2-linux-x86_64.zip
+mv smr-1.3.2-linux-x86_64/smr misc/
+rm smr-1.3.2-linux-x86_64.zip
diff --git a/tests/utils/test_query_smr.py b/tests/utils/test_query_smr.py
@@ -0,0 +1,99 @@
+from unittest.mock import patch, Mock
+
+import pandas as pd
+
+from app.utils.smr import query_smr
+
+mock_result = pd.DataFrame(
+    [
+        [
+            "chr1:982513",
+            1,
+            982513,
+            "T",
+            "C",
+            0.074442,
+            "cg24669183",
+            1,
+            534242,
+            pd.NA,
+            "N",
+            0.127188,
+            0.058496,
+            0.029684,
+        ],
+        [
+            "chr1:982513",
+            1,
+            982513,
+            "T",
+            "C",
+            0.074442,
+            "cg12726839",
+            1,
+            845311,
+            pd.NA,
+            "N",
+            0.180720,
+            0.058765,
+            0.002103,
+        ],
+        [
+            "chr1:982513",
+            1,
+            982513,
+            "A",
+            "T",
+            0.074442,
+            "cg12726839",
+            1,
+            845311,
+            pd.NA,
+            "N",
+            0.180720,
+            0.058765,
+            0.002103,
+        ],
+    ],
+    columns=[
+        "SNP",
+        "Chr",
+        "BP",
+        "A1",
+        "A2",
+        "Freq",
+        "Probe",
+        "Probe_Chr",
+        "Probe_bp",
+        "Gene",
+        "Orientation",
+        "b",
+        "SE",
+        "p",
+    ],
+)
+
+
+@patch("app.utils.smr.run_smr_query", return_value=mock_result)
+def test_query_smr(mock: Mock):
+    """Test the query function with mock data, since smr files are not committed to source,
+    ensuring that filtering and query construction functions are correst.
+    """
+    chr = 1
+    snps = ["chr1_982513_T_C"]
+    dataset = "EUR"
+    thresh = 1
+
+    res = query_smr(chr, snps, dataset, thresh, "hg38")
+
+    assert len(res) == 2
+
+    assert len(res["full_snp"].isin(snps)) == 2
+
+    mock.assert_called_once_with(
+        query_path="/code/data/smr_mqtl/EUR/EUR_chr1",
+        thresh=1,
+        chr=1,
+        start=982,
+        end=983,
+    )