Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 17 additions & 14 deletions src/fcollections/sad/_gshhg.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@
import logging
import tarfile
import typing as tp
from ftplib import FTP

import requests

from ._interface import IAuxiliaryDataFetcher

Expand Down Expand Up @@ -33,7 +34,7 @@ class GSHHG(IAuxiliaryDataFetcher):
the user home (~/.config/sad)
"""

FTP_URL = "ftp.soest.hawaii.edu"
HTTP_URL = "http://www.soest.hawaii.edu/pwessel"
FILE = "gshhg/gshhg-gmt-2.3.7.tar.gz"

@property
Expand All @@ -43,38 +44,40 @@ def keys(self) -> set[str]:
return {f"{s}_{r}" for s, r in itertools.product(subset, resolutions)}

def _download(self, remote_file: str, target_folder: Path):
    """Fetch the GSHHG archive over HTTP and return the requested file path.

    The whole archive ``self.FILE`` is downloaded from ``self.HTTP_URL`` and
    its netCDF members are extracted into ``target_folder``. ``remote_file``
    is not downloaded on its own: it is expected to be one of the extracted
    members.

    Parameters
    ----------
    remote_file
        Name of the file to locate in ``target_folder`` after extraction
    target_folder
        Folder receiving the extracted archive members

    Returns
    -------
    Path
        ``target_folder / remote_file``
    """
    # Only the HTTP fetch is kept: the FTP call belonged to the superseded
    # implementation (FTP_URL / ftplib are on the removed side of the change).
    fetch_http_file(self.HTTP_URL, self.FILE, target_folder)
    return target_folder / remote_file

def _file_name(self, key: str):
return f"binned_{key}.nc"


def fetch_http_file(url: str, filename: str, target_folder: Path):
    """Download a tar archive over HTTP and extract its netCDF members.

    The archive located at ``url + "/" + filename`` is downloaded in memory.
    Every member whose name contains ``.nc`` is then extracted directly into
    ``target_folder``; the directory structure of the archive is flattened.

    Parameters
    ----------
    url
        Base URL of the server, without trailing slash
    filename
        Path of the archive relative to ``url``
    target_folder
        Folder receiving the extracted files

    Raises
    ------
    RuntimeError
        If the download fails, either because the request could not be
        completed (connection error, timeout) or because the server answered
        with an HTTP error status
    """
    full_url = url + "/" + filename

    # Download in-memory. This should be limited to a few MB
    logger.info("Downloading %s...", full_url)
    try:
        # The request itself is inside the try block so that connection
        # errors and timeouts are reported the same way as HTTP error codes
        response = requests.get(full_url, timeout=60)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        raise RuntimeError(f"Failed to download file from {full_url}") from e

    # Filter out non-netcdf and flatten the tar gz structure
    def tar_info_filter(tar_info: tarfile.TarInfo, _) -> tarfile.TarInfo | None:
        if ".nc" not in tar_info.name:
            logger.debug("Not an netcdf, skipping extraction")
            return None

        # Keep the base name only so the file lands at the folder root
        tar_info.path = tar_info.path.split("/")[-1]
        return tar_info

    # Extract in-memory buffer
    tar_data = io.BytesIO(response.content)
    with tarfile.open(fileobj=tar_data, mode="r") as tar:
        for member in tar.getmembers():
            logger.debug("Extracting %s", member.name)
            tar.extract(member, path=target_folder, filter=tar_info_filter)

    logger.info("Downloading %s... Done", full_url)
40 changes: 29 additions & 11 deletions tests/sad/test_aux.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,23 +66,41 @@ def gshhg_tar_gz() -> bytes:
def test_gshhg(gshhg_tar_gz: bytes, tmp_path_factory: pytest.TempPathFactory):
    """Check that a key is fetched over HTTP and extracted from the archive."""
    # NOTE(review): the gshhg_tar_gz fixture is not consumed by this body; it
    # is kept so the test signature stays unchanged.
    path = tmp_path_factory.mktemp("sad")

    # Minimal in-memory archive holding the single file the test asks for
    buf = io.BytesIO()
    with tarfile.open(fileobj=buf, mode="w") as tar:
        data = b"Hello World"
        info = tarfile.TarInfo(name="binned_border_i.nc")
        info.size = len(data)
        tar.addfile(info, io.BytesIO(data))

    mock_resp = Mock()
    mock_resp.content = buf.getvalue()
    mock_resp.raise_for_status.return_value = None

    aux = GSHHG()
    with (
        # Same patch target as test_gshhg_http_error
        patch("fcollections.sad._gshhg.requests.get", return_value=mock_resp) as get,
        patch("fcollections.sad.GSHHG.lookup_folders", return_value=[path]),
    ):
        fetched_file = aux["border_i"]
        with open(fetched_file) as f:
            assert f.read() == "Hello World"

    # The mock keeps its call records after the patch is undone
    get.assert_called_once_with(
        aux.HTTP_URL + "/gshhg/gshhg-gmt-2.3.7.tar.gz", timeout=60
    )


def test_gshhg_http_error(tmp_path_factory: pytest.TempPathFactory):
    """Check that an HTTP error during the download raises a RuntimeError."""
    sad_dir = tmp_path_factory.mktemp("sad")

    # Response whose status check fails
    failing_response = Mock(
        **{"raise_for_status.side_effect": requests.HTTPError("404")}
    )

    fetcher = GSHHG()
    patched_get = patch(
        "fcollections.sad._gshhg.requests.get", return_value=failing_response
    )
    patched_lookup = patch(
        "fcollections.sad.GSHHG.lookup_folders", return_value=[sad_dir]
    )
    with patched_get, patched_lookup:
        with pytest.raises(RuntimeError):
            fetcher["border_i"]
Loading