Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,7 @@ data/gwas.public.txt.gz
data/uk_biobank/
data/mQTL_eQTM_SNPs_SAKNORM_For_CoLoc*.txt
data/Note.txt
misc/**/test*
data/slc9a3_gwas_mqtl.html
data/*lookup_table.txt.gz
.Rproj.user
Expand Down
2 changes: 2 additions & 0 deletions app/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from flask_sitemap import Sitemap
from flask_talisman import Talisman

from app.commands import register_cli
from app.config import BaseConfig, DevConfig, ProdConfig
from app.cache import cache

Expand Down Expand Up @@ -66,6 +67,7 @@ def create_app(config_class=ConfigClass):
mongo.init_app(app)
celery_init_app(app)
cache.init_app(app)
register_cli(app)

if app.config["CACHE_TYPE"] == "NullCache":
app.logger.debug("Cache is disabled")
Expand Down
82 changes: 82 additions & 0 deletions app/commands.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
from os import path
from pathlib import Path
import shutil
import tarfile
import zipfile

import click
from flask import Flask, current_app
import requests
import tqdm


def register_cli(app: Flask):
@app.cli.command(name="download-smr-mqtl", help="Download mQTL files")
@click.option(
"--lite",
is_flag=True,
help="Download 'lite' version of the McRae et al. mQTL data (only SNPs with P < 1e-5 are included; 241 MB)",
)
def download_smr_mqtl(lite):
# https://yanglab.westlake.edu.cn/software/smr/#mQTLsummarydata
file_list = [
# Whole blood mQTL data set used in Hannon et al. (2018 AJHG).(121MB)
# Saved as US_mQTLS_SMR_format
"https://yanglab.westlake.edu.cn/data/SMR/US_mQTLS_SMR_format.zip",
# 42MB
"https://yanglab.westlake.edu.cn/data/SMR/Hannon_Blood_dataset1.zip",
# 25MB
"https://yanglab.westlake.edu.cn/data/SMR/Hannon_Blood_dataset2.zip",
# https://yanglab.westlake.edu.cn/data/SMR/Hannon_FetalBrain.zip (4.8MB)
"https://yanglab.westlake.edu.cn/data/SMR/Hannon_FetalBrain.zip",
# mQTL summary data from a meta-analysis of samples of East Asian ancestry. (2.5GB)
# no particular tissue? saved as EAS
"https://yanglab.westlake.edu.cn/data/SMR/EAS.tar.gz",
# mQTL summary data from a meta-analysis of samples of European ancestry. (3.7GB)
# no particular tissue? saved as EUR
"https://yanglab.westlake.edu.cn/data/SMR/EUR.tar.gz",
# Brain-mMeta mQTL summary data (Qi et al. 2018 Nat Commun) in SMR binary (BESD) format: Brain-mMeta.tar.gz (893 MB)
# brain (from meta-analysis)
"https://yanglab.westlake.edu.cn/data/SMR/Brain-mMeta.tar.gz",
] + (
# Lite version of the McRae et al. mQTL data (only SNPs with P < 1e-5 are included; 241 MB)
# peripheral blood
["https://yanglab.westlake.edu.cn/data/SMR/LBC_BSGS_meta_lite.tar.gz"]
if lite
else [
# McRae et al. mQTL summary data (7.5 GB)
"https://yanglab.westlake.edu.cn/data/SMR/LBC_BSGS_meta.tar.gz",
]
)

data_dir = path.join(path.dirname(path.dirname(__file__)), "data", "smr_mqtl")
Path(data_dir).mkdir(exist_ok=True, parents=True)

for file_url in file_list:
filename = path.basename(file_url)
_, ext = path.splitext(filename)
tmp_save_path = path.join(data_dir, filename)
with requests.get(file_url, stream=True) as r:
current_app.logger.info(f"Downloading {filename}...")
if r.status_code != 200:
r.raise_for_status()
raise RuntimeError(
f"Request to {file_url} returned status code {r.status_code}"
)
file_size = int(r.headers.get("Content-Length", 0))
desc = "(Unknown total file size)" if file_size == 0 else ""
with tqdm.tqdm.wrapattr(
r.raw, "read", total=file_size, desc=desc
) as r_raw:
with open(tmp_save_path, "wb") as fd:
shutil.copyfileobj(r_raw, fd)

if ext == ".zip":
with zipfile.ZipFile(tmp_save_path) as zf:
zf.extractall(data_dir)
Path(tmp_save_path).unlink()
elif ext == ".gz":
with tarfile.open(tmp_save_path) as tf:
tf.extractall(data_dir, filter="data")
Path(tmp_save_path).unlink()
tf.close()
Comment on lines +20 to +82

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Likewise here, we cannot fetch this data live on the SickKids network due to the Chinese IP restriction.

176 changes: 176 additions & 0 deletions app/utils/smr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
import os
import re
import subprocess
from tempfile import NamedTemporaryFile
from typing import List, Literal, TypedDict

import pandas as pd

curr_dir = os.path.dirname(__file__)
data_dir = os.path.join(os.path.dirname(os.path.dirname(curr_dir)), "data", "smr_mqtl")


class SMRDataset(TypedDict):
assembly: Literal["hg19", "hg38"]
base_filename: str
by_chr: bool


smr_datasets: dict[str, SMRDataset] = {
"Brain-mMeta": {
"assembly": "hg38",
"by_chr": False,
"base_filename": "Brain-mMeta",
},
"EAS": {
"assembly": "hg38",
"by_chr": True,
"base_filename": "EAS",
},
"EUR": {
"assembly": "hg38",
"by_chr": True,
"base_filename": "EUR",
},
"Hannon et al. Blood dataset1": {
"assembly": "hg19",
"by_chr": False,
"base_filename": "Aberdeen_Blood",
},
"Hannon et al. Blood dataset2": {
"assembly": "hg19",
"by_chr": False,
"base_filename": "UCL_Blood",
},
"Hannon et al. FetalBrain": {
"assembly": "hg19",
"by_chr": False,
"base_filename": "FB_Brain",
},
"LBC_BSGS_meta": {
"assembly": "hg19",
"by_chr": True,
"base_filename": "bl_mqtl",
},
"LBC_BSGS_meta_lite": {
"assembly": "hg19",
"by_chr": True,
"base_filename": "bl_mqtl_lite",
},
"US_mQTLS_SMR_format": {
"assembly": "hg19",
"by_chr": False,
"base_filename": "US_Blood",
},
}


def run_smr_query(
query_path: str, chr: int, thresh: float, start: int, end: int
) -> pd.DataFrame:
"""Query the data, save to a temporary file, and return as a dataframe

:param query_path: The path to the data to be queried
:type query_path: str
:param chr: The chromosome to query
:type chr: int
:param thresh: The p-value threshold
:type thresh: float
:param start: The start bp to query
:type start: int
:param end: Then end bp to query
:type end: int
:return: The returned SNPs and pvalues
:rtype: pd.DataFrame
"""
with NamedTemporaryFile("w") as f:
query = [
"smr",
"--beqtl-summary",
query_path,
"--query",
str(thresh),
"--snp-chr",
str(chr),
"--from-snp-kb",
str(start),
"--to-snp-kb",
str(end),
"--out",
f.name,
]

subprocess.run(query, check=True)

return pd.read_csv(f"{f.name}.txt", sep="\t")


def query_smr(
chr: int,
snps: List[str],
dataset: str,
thresh: float = 5.0e-8,
assembly: Literal["hg19", "hg38"] = "hg38",
) -> pd.DataFrame:
"""Query mqtl data in smr format

:param chr: The chromosome to query
:type chr: int
:param snps: A list of SNPS in format chr{chr}_{bp}_ref_alt
:type snps: List[str]
:param dataset: The dataset to query
:type dataset: str
:param thresh: The p-value threshold, defaults to 5e-8
:type thresh: float
:param assembly: The genome assembly to use, defaults to "hg38"
:type assembly: Literal["hg19", "hg38"]
:raises FileNotFoundError: If the dataset does not exist
:raises ValueError: If the requested assembly does not match the dataset assembly
:return: The SNPs and pvalues as a dataframe with the following columns:
'SNP', 'Chr', 'BP', 'A1', 'A2', 'Freq', 'Probe', 'Probe_Chr',
'Probe_bp', 'Gene', 'Orientation', 'b', 'SE', 'p', 'full_snp'
Note that 'full_snp' is a combined column that takes the same format as those in ``snps``
:rtype: pd.DataFrame
"""
if dataset not in smr_datasets.keys():
raise FileNotFoundError(f"Dataset {dataset} does not exist!")

if smr_datasets[dataset]["assembly"] != assembly:
raise ValueError(
f"Dataset {dataset} uses {smr_datasets[dataset]['assembly']} but {assembly} was requested!"
)

dataset_dir = os.path.join(data_dir, dataset)
base_filepath = os.path.join(dataset_dir, dataset)
if smr_datasets[dataset]["by_chr"]:
base_filepath = f"{base_filepath}_chr{chr}"

regex = r"_(\d+)_"

snp_poses = [int(re.findall(regex, snp)[0]) for snp in snps]

start = min(snp_poses) // 1000
end = max(snp_poses) // 1000 + 1

query_result = run_smr_query(
query_path=base_filepath,
chr=chr,
thresh=thresh,
start=start,
end=end,
)

query_result["full_snp"] = query_result.apply(
lambda df: f"chr{str(df['Chr'])}"
+ "_"
+ str(df["BP"])
+ "_"
+ df["A1"]
+ "_"
+ df["A2"],
axis=1,
)

filtered = query_result[query_result["full_snp"].isin(snps)]

return filtered
7 changes: 7 additions & 0 deletions setup/setup-smr.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#!/bin/bash
set -e

wget https://yanglab.westlake.edu.cn/software/smr/download/smr-1.3.2-linux-x86_64.zip
unzip smr-1.3.2-linux-x86_64.zip
mv smr-1.3.2-linux-x86_64/smr misc/
rm smr-1.3.2-linux-x86_64.zip
99 changes: 99 additions & 0 deletions tests/utils/test_query_smr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
from unittest.mock import patch, Mock

import pandas as pd

from app.utils.smr import query_smr

mock_result = pd.DataFrame(
[
[
"chr1:982513",
1,
982513,
"T",
"C",
0.074442,
"cg24669183",
1,
534242,
pd.NA,
"N",
0.127188,
0.058496,
0.029684,
],
[
"chr1:982513",
1,
982513,
"T",
"C",
0.074442,
"cg12726839",
1,
845311,
pd.NA,
"N",
0.180720,
0.058765,
0.002103,
],
[
"chr1:982513",
1,
982513,
"A",
"T",
0.074442,
"cg12726839",
1,
845311,
pd.NA,
"N",
0.180720,
0.058765,
0.002103,
],
],
columns=[
"SNP",
"Chr",
"BP",
"A1",
"A2",
"Freq",
"Probe",
"Probe_Chr",
"Probe_bp",
"Gene",
"Orientation",
"b",
"SE",
"p",
],
)


@patch("app.utils.smr.run_smr_query", return_value=mock_result)
def test_query_smr(mock: Mock):
"""Test the query function with mock data, since smr files are not committed to source,
ensuring that filtering and query construction functions are correst.
"""
chr = 1
snps = ["chr1_982513_T_C"]
dataset = "EUR"
thresh = 1

res = query_smr(chr, snps, dataset, thresh, "hg38")

assert len(res) == 2

assert len(res["full_snp"].isin(snps)) == 2

mock.assert_called_once_with(
query_path="/code/data/smr_mqtl/EUR/EUR_chr1",
thresh=1,
chr=1,
start=982,
end=983,
)