From 8136f36313f955dd4108502b599f68c708f7a502 Mon Sep 17 00:00:00 2001
From: cklamann <12862284+cklamann@users.noreply.github.com>
Date: Mon, 23 Jun 2025 13:28:34 -0400
Subject: [PATCH 1/7] add click command to download mqtl files

---
 Dockerfile      | 10 ++++++-
 app/__init__.py |  2 ++
 app/commands.py | 79 +++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 90 insertions(+), 1 deletion(-)
 create mode 100644 app/commands.py

diff --git a/Dockerfile b/Dockerfile
index e006f5f..bfb6aa3 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -42,7 +42,8 @@ RUN apt-get update && \
   liblapack-dev \
   libpcre2-dev \
   xauth \
-  vim
+  vim \
+  zip
 
 # install plink
 RUN wget -O /tmp/plink.zip https://s3.amazonaws.com/plink1-assets/plink_linux_x86_64_20241022.zip \
@@ -50,6 +51,13 @@ RUN wget -O /tmp/plink.zip https://s3.amazonaws.com/plink1-assets/plink_linux_x8
   && mv /tmp/plink /usr/local/bin/plink \
   && rm /tmp/*
 
+# install smr for mqtl querying
+RUN wget https://yanglab.westlake.edu.cn/software/smr/download/smr-1.3.2-linux-x86_64.zip \
+  && unzip smr-1.3.2-linux-x86_64.zip \
+  && mv smr-1.3.2-linux-x86_64/smr /usr/local/bin/ \
+  && rm smr-1.3.2-linux-x86_64.zip
+
+
 # Install Poetry
 # https://github.com/python-poetry/poetry/issues/6397#issuecomment-1236327500
 ENV POETRY_HOME=/opt/poetry
diff --git a/app/__init__.py b/app/__init__.py
index ab3bd07..631cdbf 100644
--- a/app/__init__.py
+++ b/app/__init__.py
@@ -9,6 +9,7 @@
 from flask_sitemap import Sitemap
 from flask_talisman import Talisman
 
+from app.commands import register_cli
 from app.config import BaseConfig, DevConfig, ProdConfig
 
 ConfigClass = ProdConfig if BaseConfig.APP_ENV == "production" else DevConfig
@@ -64,6 +65,7 @@ def create_app(config_class=ConfigClass):
     talisman.init_app(app, content_security_policy=app.config["CSP_POLICY"])
     mongo.init_app(app)
     celery_init_app(app)
+    register_cli(app)
 
     with app.app_context():
         from app import routes
diff --git a/app/commands.py b/app/commands.py
new file mode 100644
index 0000000..18ed8d3
--- /dev/null
+++ b/app/commands.py
@@ -0,0 +1,79 @@
+from os import path
+from pathlib import Path
+import shutil
+import tarfile
+import zipfile
+
+import click
+from flask import Flask, current_app
+import requests
+import tqdm
+
+
+def register_cli(app: Flask):
+    @app.cli.command(name="download-smr-eqtl", help="Download mQTL files")
+    @click.option(
+        "--lite",
+        is_flag=True,
+        help="Download 'lite' version of the McRae et al. mQTL data (only SNPs with P < 1e-5 are included; 241 MB)",
+    )
+    def download_smr_mqtl(lite):
+        # https://yanglab.westlake.edu.cn/software/smr/#mQTLsummarydata
+        file_list = [
+            # Whole blood mQTL data set used in Hannon et al. (2018 AJHG).(121MB)
+            "https://yanglab.westlake.edu.cn/data/SMR/US_mQTLS_SMR_format.zip",
+            # 42MB
+            "https://yanglab.westlake.edu.cn/data/SMR/Hannon_Blood_dataset1.zip",
+            # 25MB
+            "https://yanglab.westlake.edu.cn/data/SMR/Hannon_Blood_dataset2.zip",
+            # https://yanglab.westlake.edu.cn/data/SMR/Hannon_FetalBrain.zip (4.8MB)
+            "https://yanglab.westlake.edu.cn/data/SMR/Hannon_FetalBrain.zip",
+            # mQTL summary data from a meta-analysis of samples of East Asian ancestry. (2.5GB)
+            "https://yanglab.westlake.edu.cn/data/SMR/EAS.tar.gz",
+            # mQTL summary data from a meta-analysis of samples of European ancestry. (3.7GB)
+            "https://yanglab.westlake.edu.cn/data/SMR/EUR.tar.gz",
+            # Brain-mMeta mQTL summary data (Qi et al. 2018 Nat Commun) in SMR binary (BESD) format: Brain-mMeta.tar.gz (893 MB)
+            "https://yanglab.westlake.edu.cn/data/SMR/Brain-mMeta.tar.gz",
+        ] + (
+            # Lite version of the McRae et al. mQTL data (only SNPs with P < 1e-5 are included; 241 MB)
+            ["https://yanglab.westlake.edu.cn/data/SMR/LBC_BSGS_meta_lite.tar.gz"]
+            if lite
+            else [
+                # McRae et al. mQTL summary data (7.5 GB)
+                "https://yanglab.westlake.edu.cn/data/SMR/LBC_BSGS_meta.tar.gz",
+            ]
+        )
+
+        data_dir = path.join(path.dirname(path.dirname(__file__)), "data", "smr_eqtl")
+        Path(data_dir).mkdir(exist_ok=True, parents=True)
+
+        file_list.reverse()
+
+        for file_url in file_list:
+            filename = path.basename(file_url)
+            _, ext = path.splitext(filename)
+            tmp_save_path = path.join(data_dir, filename)
+            with requests.get(file_url, stream=True) as r:
+                current_app.logger.info(f"Downloading {filename}...")
+                if r.status_code != 200:
+                    r.raise_for_status()
+                    raise RuntimeError(
+                        f"Request to {file_url} returned status code {r.status_code}"
+                    )
+                file_size = int(r.headers.get("Content-Length", 0))
+                desc = "(Unknown total file size)" if file_size == 0 else ""
+                with tqdm.tqdm.wrapattr(
+                    r.raw, "read", total=file_size, desc=desc
+                ) as r_raw:
+                    with open(tmp_save_path, "wb") as fd:
+                        shutil.copyfileobj(r_raw, fd)
+
+            # Extract, then delete the archive only after its handle is closed
+            # (the context managers close it, so no explicit close() is needed).
+            if ext == ".zip":
+                with zipfile.ZipFile(tmp_save_path) as zf:
+                    zf.extractall(data_dir)
+            elif ext == ".gz":
+                with tarfile.open(tmp_save_path) as tf:
+                    tf.extractall(data_dir, filter="data")
+            Path(tmp_save_path).unlink()

From f76a819c0f47fb8fd3874afae9caf0e8f66b243e Mon Sep 17 00:00:00 2001
From: cklamann <12862284+cklamann@users.noreply.github.com>
Date: Tue, 15 Jul 2025 10:36:42 -0400
Subject: [PATCH 2/7] sketch out mqtl query function

---
 app/commands.py  |  11 +++--
 app/utils/smr.py | 123 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 130 insertions(+), 4 deletions(-)
 create mode 100644 app/utils/smr.py

diff --git a/app/commands.py b/app/commands.py
index 18ed8d3..1b4330b 100644
--- a/app/commands.py
+++ b/app/commands.py
@@ -11,7 +11,7 @@
 
 
 def register_cli(app: Flask):
-    @app.cli.command(name="download-smr-eqtl", help="Download mQTL files")
+    @app.cli.command(name="download-smr-mqtl", help="Download mQTL files")
     @click.option(
         "--lite",
         is_flag=True,
@@ -21,6 +21,7 @@ def download_smr_mqtl(lite):
         # https://yanglab.westlake.edu.cn/software/smr/#mQTLsummarydata
         file_list = [
             # Whole blood mQTL data set used in Hannon et al. (2018 AJHG).(121MB)
+            # Saved as US_mQTLS_SMR_format
             "https://yanglab.westlake.edu.cn/data/SMR/US_mQTLS_SMR_format.zip",
             # 42MB
             "https://yanglab.westlake.edu.cn/data/SMR/Hannon_Blood_dataset1.zip",
@@ -29,13 +30,17 @@ def download_smr_mqtl(lite):
             # https://yanglab.westlake.edu.cn/data/SMR/Hannon_FetalBrain.zip (4.8MB)
             "https://yanglab.westlake.edu.cn/data/SMR/Hannon_FetalBrain.zip",
             # mQTL summary data from a meta-analysis of samples of East Asian ancestry. (2.5GB)
+            # no particular tissue? saved as EAS
             "https://yanglab.westlake.edu.cn/data/SMR/EAS.tar.gz",
             # mQTL summary data from a meta-analysis of samples of European ancestry. (3.7GB)
+            # no particular tissue? saved as EUR
             "https://yanglab.westlake.edu.cn/data/SMR/EUR.tar.gz",
             # Brain-mMeta mQTL summary data (Qi et al. 2018 Nat Commun) in SMR binary (BESD) format: Brain-mMeta.tar.gz (893 MB)
+            # brain (from meta-analysis)
             "https://yanglab.westlake.edu.cn/data/SMR/Brain-mMeta.tar.gz",
         ] + (
             # Lite version of the McRae et al. mQTL data (only SNPs with P < 1e-5 are included; 241 MB)
+            # peripheral blood
             ["https://yanglab.westlake.edu.cn/data/SMR/LBC_BSGS_meta_lite.tar.gz"]
             if lite
             else [
@@ -44,11 +49,9 @@ def download_smr_mqtl(lite):
             ]
         )
 
-        data_dir = path.join(path.dirname(path.dirname(__file__)), "data", "smr_eqtl")
+        data_dir = path.join(path.dirname(path.dirname(__file__)), "data", "smr_mqtl")
         Path(data_dir).mkdir(exist_ok=True, parents=True)
 
-        file_list.reverse()
-
         for file_url in file_list:
             filename = path.basename(file_url)
             _, ext = path.splitext(filename)
diff --git a/app/utils/smr.py b/app/utils/smr.py
new file mode 100644
index 0000000..908c73e
--- /dev/null
+++ b/app/utils/smr.py
@@ -0,0 +1,123 @@
+import os
+import re
+import subprocess
+from tempfile import NamedTemporaryFile
+from typing import List, TypedDict
+
+import pandas as pd
+
+curr_dir = os.path.dirname(__file__)
+data_dir = os.path.join(os.path.dirname(os.path.dirname(curr_dir)), "data", "smr_mqtl")
+
+
+class SMRDataset(TypedDict):
+    base_filename: str
+    by_chr: bool
+
+
+datasets: dict[str, SMRDataset] = {
+    "Brain-mMeta": {
+        "by_chr": False,
+        "base_filename": "Brain-mMeta",
+    },
+    "EAS": {
+        "by_chr": True,
+        "base_filename": "EAS",
+    },
+    "EUR": {
+        "by_chr": True,
+        "base_filename": "EUR",
+    },
+    "Hannon et al. Blood dataset1": {
+        "by_chr": False,
+        "base_filename": "Aberdeen_Blood",
+    },
+    "Hannon et al. Blood dataset2": {
+        "by_chr": False,
+        "base_filename": "UCL_Blood",
+    },
+    "Hannon et al. FetalBrain": {
+        "by_chr": False,
+        "base_filename": "FB_Brain",
+    },
+    # TODO: confirm
+    "LBC_BSGS_meta": {
+        "by_chr": True,
+        "base_filename": "bl_mqtl",
+    },
+    "LBC_BSGS_meta_lite": {
+        "by_chr": True,
+        "base_filename": "bl_mqtl_lite",
+    },
+    "US_mQTLS_SMR_format": {
+        "by_chr": False,
+        "base_filename": "US_Blood",
+    },
+}
+
+
+def query_smr(chr: int, snps: List[str], dataset: str, thresh: float = 5.0e-8):
+    """Query mqtl data in smr format
+
+    :param chr: The chromosome to query
+    :type chr: int
+    :param snps: A list of SNPS in format chr{chr}_{bp}_ref_alt
+    :type snps: List[str]
+    :param snps_rs: A list of the *same* SNPs in rsid format
+    :type snps_rs: List[str]
+    :param dataset: The dataset to query
+    :type dataset: str
+    :param thresh: The p-value threshold, defaults to 5e-8
+    :type thresh: float
+    :raises FileNotFoundError: If the dataset does not exist
+    """
+    if dataset not in datasets.keys():
+        raise FileNotFoundError(f"Dataset {dataset} does not exist!")
+    dataset_dir = os.path.join(data_dir, dataset)
+    base_filepath = os.path.join(dataset_dir, dataset)
+    if datasets[dataset]["by_chr"]:
+        base_filepath = f"{base_filepath}_chr{chr}"
+
+    regex = r"_(\d+)_"
+
+    snp_poses = [int(re.findall(regex, snp)[0]) for snp in snps]
+
+    start = min(snp_poses) // 1000
+    end = max(snp_poses) // 1000 + 1
+
+    with NamedTemporaryFile("w") as f:
+        query = [
+            "smr",
+            "--beqtl-summary",
+            base_filepath,
+            "--query",
+            str(thresh),
+            "--snp-chr",
+            str(chr),
+            "--from-snp-kb",
+            str(start),
+            "--to-snp-kb",
+            str(end),
+            "--out",
+            f.name,
+        ]
+
+        subprocess.run(query, check=True)
+
+        res = pd.read_csv(f"{f.name}.txt", sep="\t")
+
+    res.sort_values(by="BP", inplace=True)
+    res["full_snp"] = res.apply(
+        lambda x: f"chr{str(x['Chr'])}"
+        + "_"
+        + str(x["BP"])
+        + "_"
+        + x["A1"]
+        + "_"
+        + x["A2"],
+        axis=1,
+    )
+
+    filtered = res[res["full_snp"].isin(snps)]
+
+    return filtered

From 5ece539cf1c8c6792584c8628152a60c69e606b4 Mon Sep 17 00:00:00 2001
From: cklamann <12862284+cklamann@users.noreply.github.com>
Date: Mon, 21 Jul 2025 11:26:43 -0400
Subject: [PATCH 3/7] add test for smr query

---
 .gitignore                    |  2 +-
 app/utils/smr.py              | 87 +++++++++++++++++++-----------
 tests/utils/test_query_smr.py | 99 +++++++++++++++++++++++++++++++++++
 3 files changed, 157 insertions(+), 31 deletions(-)
 create mode 100644 tests/utils/test_query_smr.py

diff --git a/.gitignore b/.gitignore
index 35223bf..d0124b2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -137,7 +137,7 @@ data/gwas.public.txt.gz
 data/uk_biobank/
 data/mQTL_eQTM_SNPs_SAKNORM_For_CoLoc*.txt
 data/Note.txt
-*test*
+misc/**/test*
 data/slc9a3_gwas_mqtl.html
 data/*lookup_table.txt.gz
 .Rproj.user
diff --git a/app/utils/smr.py b/app/utils/smr.py
index 908c73e..461b9a2 100644
--- a/app/utils/smr.py
+++ b/app/utils/smr.py
@@ -56,20 +56,65 @@ class SMRDataset(TypedDict):
 }
 
 
-def query_smr(chr: int, snps: List[str], dataset: str, thresh: float = 5.0e-8):
+def run_smr_query(
+    query_path: str, chr: int, thresh: float, start: int, end: int
+) -> pd.DataFrame:
+    """Query the data with smr in a scratch directory and return as a dataframe
+
+    :param query_path: The path to the data to be queried
+    :type query_path: str
+    :param chr: The chromosome to query
+    :type chr: int
+    :param thresh: The p-value threshold
+    :type thresh: float
+    :param start: The start position to query, in kb (passed to --from-snp-kb)
+    :type start: int
+    :param end: The end position to query, in kb (passed to --to-snp-kb)
+    :type end: int
+    :return: The returned SNPs and pvalues
+    :rtype: pd.DataFrame
+    """
+    from tempfile import TemporaryDirectory
+    with TemporaryDirectory() as tmp_dir:
+        out_prefix = os.path.join(tmp_dir, "smr_query")  # smr writes <out>.txt
+        query = [
+            "smr",
+            "--beqtl-summary",
+            query_path,
+            "--query",
+            str(thresh),
+            "--snp-chr",
+            str(chr),
+            "--from-snp-kb",
+            str(start),
+            "--to-snp-kb",
+            str(end),
+            "--out",
+            out_prefix,
+        ]
+        subprocess.run(query, check=True)
+        return pd.read_csv(f"{out_prefix}.txt", sep="\t")
+
+
+def query_smr(
+    chr: int, snps: List[str], dataset: str, thresh: float = 5.0e-8
+) -> pd.DataFrame:
     """Query mqtl data in smr format
 
     :param chr: The chromosome to query
     :type chr: int
     :param snps: A list of SNPS in format chr{chr}_{bp}_ref_alt
     :type snps: List[str]
-    :param snps_rs: A list of the *same* SNPs in rsid format
-    :type snps_rs: List[str]
     :param dataset: The dataset to query
     :type dataset: str
     :param thresh: The p-value threshold, defaults to 5e-8
     :type thresh: float
     :raises FileNotFoundError: If the dataset does not exist
+    :return: The SNPs and pvalues as a dataframe with the following columns:
+    'SNP', 'Chr', 'BP', 'A1', 'A2', 'Freq', 'Probe', 'Probe_Chr',
+    'Probe_bp', 'Gene', 'Orientation', 'b', 'SE', 'p', 'full_snp'
+    Note that 'full_snp' is a combined column that takes the same format as those in ``snps``
+    :rtype: pd.DataFrame
     """
     if dataset not in datasets.keys():
         raise FileNotFoundError(f"Dataset {dataset} does not exist!")
@@ -85,39 +130,21 @@ def query_smr(chr: int, snps: List[str], dataset: str, thresh: float = 5.0e-8):
     start = min(snp_poses) // 1000
     end = max(snp_poses) // 1000 + 1
 
-    with NamedTemporaryFile("w") as f:
-        query = [
-            "smr",
-            "--beqtl-summary",
-            base_filepath,
-            "--query",
-            str(thresh),
-            "--snp-chr",
-            str(chr),
-            "--from-snp-kb",
-            str(start),
-            "--to-snp-kb",
-            str(end),
-            "--out",
-            f.name,
-        ]
-
-        subprocess.run(query, check=True)
-
-        res = pd.read_csv(f"{f.name}.txt", sep="\t")
+    query_result = run_smr_query(
+        query_path=base_filepath, chr=chr, thresh=thresh, start=start, end=end
+    )
 
-    res.sort_values(by="BP", inplace=True)
-    res["full_snp"] = res.apply(
-        lambda x: f"chr{str(x['Chr'])}"
+    query_result["full_snp"] = query_result.apply(
+        lambda df: f"chr{str(df['Chr'])}"
         + "_"
-        + str(x["BP"])
+        + str(df["BP"])
         + "_"
-        + x["A1"]
+        + df["A1"]
         + "_"
-        + x["A2"],
+        + df["A2"],
         axis=1,
     )
 
-    filtered = res[res["full_snp"].isin(snps)]
+    filtered = query_result[query_result["full_snp"].isin(snps)]
 
     return filtered
diff --git a/tests/utils/test_query_smr.py b/tests/utils/test_query_smr.py
new file mode 100644
index 0000000..9a3cae3
--- /dev/null
+++ b/tests/utils/test_query_smr.py
@@ -0,0 +1,99 @@
+from unittest.mock import patch, Mock
+
+import pandas as pd
+
+from app.utils.smr import query_smr
+
+mock_result = pd.DataFrame(
+    [
+        [
+            "chr1:982513",
+            1,
+            982513,
+            "T",
+            "C",
+            0.074442,
+            "cg24669183",
+            1,
+            534242,
+            pd.NA,
+            "N",
+            0.127188,
+            0.058496,
+            0.029684,
+        ],
+        [
+            "chr1:982513",
+            1,
+            982513,
+            "T",
+            "C",
+            0.074442,
+            "cg12726839",
+            1,
+            845311,
+            pd.NA,
+            "N",
+            0.180720,
+            0.058765,
+            0.002103,
+        ],
+        [
+            "chr1:982513",
+            1,
+            982513,
+            "A",
+            "T",
+            0.074442,
+            "cg12726839",
+            1,
+            845311,
+            pd.NA,
+            "N",
+            0.180720,
+            0.058765,
+            0.002103,
+        ],
+    ],
+    columns=[
+        "SNP",
+        "Chr",
+        "BP",
+        "A1",
+        "A2",
+        "Freq",
+        "Probe",
+        "Probe_Chr",
+        "Probe_bp",
+        "Gene",
+        "Orientation",
+        "b",
+        "SE",
+        "p",
+    ],
+)
+
+
+@patch("app.utils.smr.run_smr_query", return_value=mock_result)
+def test_query_smr(mock: Mock):
+    """Test the query function with mock data, since smr files are not committed to source,
+    ensuring that filtering and query construction functions are correct.
+    """
+    chr = 1
+    snps = ["chr1_982513_T_C"]
+    dataset = "EUR"
+    thresh = 1
+
+    res = query_smr(chr, snps, dataset, thresh)
+
+    assert len(res) == 2
+
+    assert res["full_snp"].isin(snps).all()  # mismatched-allele row is dropped
+
+    mock.assert_called_once_with(
+        query_path="/code/data/smr_mqtl/EUR/EUR_chr1",
+        thresh=1,
+        chr=1,
+        start=982,
+        end=983,
+    )

From cd0778181be91ced67207ab6c3fa30220816739f Mon Sep 17 00:00:00 2001
From: cklamann <12862284+cklamann@users.noreply.github.com>
Date: Mon, 21 Jul 2025 11:44:19 -0400
Subject: [PATCH 4/7] add assembly to smr query

---
 app/utils/smr.py | 39 +++++++++++++++++++++++++++++++++------
 1 file changed, 33 insertions(+), 6 deletions(-)

diff --git a/app/utils/smr.py b/app/utils/smr.py
index 461b9a2..6a78bac 100644
--- a/app/utils/smr.py
+++ b/app/utils/smr.py
@@ -2,7 +2,7 @@
 import re
 import subprocess
 from tempfile import NamedTemporaryFile
-from typing import List, TypedDict
+from typing import List, Literal, TypedDict
 
 import pandas as pd
 
@@ -11,45 +11,55 @@
 
 
 class SMRDataset(TypedDict):
+    assembly: Literal["hg19", "hg38"]
     base_filename: str
     by_chr: bool
 
 
-datasets: dict[str, SMRDataset] = {
+smr_datasets: dict[str, SMRDataset] = {
     "Brain-mMeta": {
+        "assembly": "hg38",
         "by_chr": False,
         "base_filename": "Brain-mMeta",
     },
     "EAS": {
+        "assembly": "hg38",
         "by_chr": True,
         "base_filename": "EAS",
     },
     "EUR": {
+        "assembly": "hg38",
         "by_chr": True,
         "base_filename": "EUR",
     },
     "Hannon et al. Blood dataset1": {
+        "assembly": "hg19",
         "by_chr": False,
         "base_filename": "Aberdeen_Blood",
     },
     "Hannon et al. Blood dataset2": {
+        "assembly": "hg19",
         "by_chr": False,
         "base_filename": "UCL_Blood",
     },
     "Hannon et al. FetalBrain": {
+        "assembly": "hg19",
         "by_chr": False,
         "base_filename": "FB_Brain",
     },
     # TODO: confirm
     "LBC_BSGS_meta": {
+        "assembly": "hg19",
         "by_chr": True,
         "base_filename": "bl_mqtl",
     },
     "LBC_BSGS_meta_lite": {
+        "assembly": "hg19",
         "by_chr": True,
         "base_filename": "bl_mqtl_lite",
     },
     "US_mQTLS_SMR_format": {
+        "assembly": "hg19",
         "by_chr": False,
         "base_filename": "US_Blood",
     },
@@ -97,7 +107,11 @@ def run_smr_query(
 
 
 def query_smr(
-    chr: int, snps: List[str], dataset: str, thresh: float = 5.0e-8
+    chr: int,
+    snps: List[str],
+    dataset: str,
+    thresh: float = 5.0e-8,
+    assembly: Literal["hg19", "hg38"] = "hg38",
 ) -> pd.DataFrame:
     """Query mqtl data in smr format
 
@@ -109,18 +123,27 @@ def query_smr(
     :type dataset: str
     :param thresh: The p-value threshold, defaults to 5e-8
     :type thresh: float
+    :param assembly: The genome assembly to use, defaults to "hg38"
+    :type assembly: Literal["hg19", "hg38"]
     :raises FileNotFoundError: If the dataset does not exist
+    :raises ValueError: If the requested assembly does not match the dataset assembly
     :return: The SNPs and pvalues as a dataframe with the following columns:
     'SNP', 'Chr', 'BP', 'A1', 'A2', 'Freq', 'Probe', 'Probe_Chr',
     'Probe_bp', 'Gene', 'Orientation', 'b', 'SE', 'p', 'full_snp'
     Note that 'full_snp' is a combined column that takes the same format as those in ``snps``
     :rtype: pd.DataFrame
     """
-    if dataset not in datasets.keys():
+    if dataset not in smr_datasets.keys():
         raise FileNotFoundError(f"Dataset {dataset} does not exist!")
+
+    if smr_datasets[dataset]["assembly"] != assembly:
+        raise ValueError(
+            f"Dataset {dataset} uses {smr_datasets[dataset]['assembly']} but {assembly} was requested!"
+        )
+
     dataset_dir = os.path.join(data_dir, dataset)
     base_filepath = os.path.join(dataset_dir, dataset)
-    if datasets[dataset]["by_chr"]:
+    if smr_datasets[dataset]["by_chr"]:
         base_filepath = f"{base_filepath}_chr{chr}"
 
     regex = r"_(\d+)_"
@@ -131,7 +154,11 @@ def query_smr(
     end = max(snp_poses) // 1000 + 1
 
     query_result = run_smr_query(
-        query_path=base_filepath, chr=chr, thresh=thresh, start=start, end=end
+        query_path=base_filepath,
+        chr=chr,
+        thresh=thresh,
+        start=start,
+        end=end,
     )
 
     query_result["full_snp"] = query_result.apply(

From e32b1bfd02113c5b735e0ae14412ff28e423992d Mon Sep 17 00:00:00 2001
From: cklamann <12862284+cklamann@users.noreply.github.com>
Date: Wed, 23 Jul 2025 11:57:06 -0400
Subject: [PATCH 5/7] confirm lbc_bsgs filename

---
 app/utils/smr.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/app/utils/smr.py b/app/utils/smr.py
index 6a78bac..ab6395b 100644
--- a/app/utils/smr.py
+++ b/app/utils/smr.py
@@ -47,7 +47,6 @@ class SMRDataset(TypedDict):
         "by_chr": False,
         "base_filename": "FB_Brain",
     },
-    # TODO: confirm
     "LBC_BSGS_meta": {
         "assembly": "hg19",
         "by_chr": True,

From 92e8b0e68f356473f838815477f1bbb915c80069 Mon Sep 17 00:00:00 2001
From: cklamann <12862284+cklamann@users.noreply.github.com>
Date: Tue, 26 Aug 2025 13:13:53 -0400
Subject: [PATCH 6/7] add assembly to smr test

---
 tests/utils/test_query_smr.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/utils/test_query_smr.py b/tests/utils/test_query_smr.py
index 9a3cae3..34a113a 100644
--- a/tests/utils/test_query_smr.py
+++ b/tests/utils/test_query_smr.py
@@ -84,7 +84,7 @@ def test_query_smr(mock: Mock):
     dataset = "EUR"
     thresh = 1
 
-    res = query_smr(chr, snps, dataset, thresh)
+    res = query_smr(chr, snps, dataset, thresh, "hg38")
 
     assert len(res) == 2
 

From 42de7bc990c446da015f7c40c9d0ec885f194517 Mon Sep 17 00:00:00 2001
From: Mackenzie Ian Frew <mackenzie.frew@sickkids.ca>
Date: Wed, 14 Jan 2026 15:12:52 -0500
Subject: [PATCH 7/7] Move SMR fetch to helper script

---
 Dockerfile         | 10 +---------
 setup/setup-smr.sh |  7 +++++++
 2 files changed, 8 insertions(+), 9 deletions(-)
 create mode 100644 setup/setup-smr.sh

diff --git a/Dockerfile b/Dockerfile
index 3dbf11e..b4e1643 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -21,8 +21,7 @@ RUN apt-get update && \
   r-base-dev \
   xauth \
   vim \
-  wget \
-  zip
+  wget
 
 # install plink
 RUN wget -O /tmp/plink.zip https://s3.amazonaws.com/plink1-assets/plink_linux_x86_64_20241022.zip \
@@ -39,13 +38,6 @@ RUN curl -f -L -O https://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/liftOve
   mkdir /usr/local/share/liftOver && \
   mv hg38ToHg19.over.chain.gz hg19ToHg38.over.chain.gz /usr/local/share/liftOver/
 
-# install smr for mqtl querying
-RUN wget https://yanglab.westlake.edu.cn/software/smr/download/smr-1.3.2-linux-x86_64.zip \
-  && unzip smr-1.3.2-linux-x86_64.zip \
-  && mv smr-1.3.2-linux-x86_64/smr /usr/local/bin/ \
-  && rm smr-1.3.2-linux-x86_64.zip
-
-
 # Install Poetry
 # https://github.com/python-poetry/poetry/issues/6397#issuecomment-1236327500
 ENV POETRY_HOME=/opt/poetry
diff --git a/setup/setup-smr.sh b/setup/setup-smr.sh
new file mode 100644
index 0000000..656fd2f
--- /dev/null
+++ b/setup/setup-smr.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+# Download the smr binary into ./misc, then remove the archive and unpacked folder.
+set -e
+wget https://yanglab.westlake.edu.cn/software/smr/download/smr-1.3.2-linux-x86_64.zip
+unzip smr-1.3.2-linux-x86_64.zip
+mv smr-1.3.2-linux-x86_64/smr misc/
+rm -r smr-1.3.2-linux-x86_64.zip smr-1.3.2-linux-x86_64