Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions synapseclient/core/download/download_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -602,6 +602,18 @@ def download_fn(
):
span.set_attribute("synapse.storage.provider", "s3")

# Reuse the pre-signed URL already retrieved above rather than
# requesting a second one inside the multi-threaded downloader. The
# file_handle_id/object_id/object_type are still passed so the
# downloader can refetch the URL if it expires mid-download.
presigned_url_info = PresignedUrlInfo(
file_name=file_handle["fileName"],
url=file_handle_result["preSignedURL"],
expiration_utc=_pre_signed_url_expiration_time(
file_handle_result["preSignedURL"]
),
)

# run the download multi threaded if the file supports it, we're configured to do so,
# and the file is large enough that it would be broken into parts to take advantage of
# multiple downloading threads. otherwise it's more efficient to run the download as a simple
Expand All @@ -612,6 +624,7 @@ def download_fn(
object_type=entity_type,
destination=destination,
expected_md5=actual_md5,
presigned_url=presigned_url_info,
synapse_client=syn,
)

Expand Down Expand Up @@ -738,7 +751,13 @@ async def download_from_url_multi_threaded(
# If the destination is a directory, then the file name should be the same as the file name in the presigned url
# This is added to ensure the temp file can be copied to the desired destination without changing the file name
destination = os.path.join(destination, presigned_url.file_name)
# Pass through the file handle identifiers so the downloader can refetch a
# fresh pre-signed URL if the seeded one expires mid-download. Callers that
# cannot refetch (e.g. wiki attachments) omit these, leaving them as None.
request = DownloadRequest(

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you clarify or add a comment about the asymmetry here? We are doing:

        request = DownloadRequest(
            file_handle_id=int(file_handle_id) if file_handle_id is not None else None,
            object_id=object_id,
            object_type=object_type,
            path=temp_destination,
            debug=client.debug,
            presigned_url=presigned_url,
        )

But in the else statement, I saw:

        request = DownloadRequest(
            file_handle_id=int(file_handle_id),
            object_id=object_id,
            object_type=object_type,
            path=temp_destination,
            debug=client.debug,
        )

I am curious why None guard is only needed in the if presigned_url is not None branch? Is it because file_handle_id is guaranteed non-None in else?

file_handle_id=int(file_handle_id) if file_handle_id is not None else None,
Comment on lines +754 to +758
object_id=object_id,
object_type=object_type,
path=temp_destination,
debug=client.debug,
presigned_url=presigned_url,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,84 @@ async def test_pre_signed_url_expiration_time(self) -> None:
assert expected == download_async._pre_signed_url_expiration_time(url)


class TestSeededPresignedUrlProvider:
"""Tests that a PresignedUrlProvider seeded with an already-fetched URL reuses
it on the happy path but still refetches (using the request's file handle
identifiers) once the seeded URL nears expiry mid-download."""

@pytest.fixture(scope="function", autouse=True)
def setup_method(self) -> None:
self.mock_synapse_client = mock.create_autospec(Synapse)
# Request carries the file handle identifiers so a refetch is possible.
self.download_request = DownloadRequest(
file_handle_id=123,
object_id="syn456",
object_type="FileEntity",
path="/myFakepath",
)

async def test_seeded_url_reused_without_refetch(self) -> None:
"""A seeded, unexpired URL is returned without calling the batch endpoint."""
utc_now = datetime.datetime.now(tz=datetime.timezone.utc)
seeded = PresignedUrlInfo(
file_name="myFile.txt",
url="https://synapse.org/somefile.txt",
expiration_utc=utc_now + datetime.timedelta(hours=1),
)

with mock.patch(
"synapseclient.core.download.download_async.get_file_handle_for_download",
) as mock_get_file_handle:
provider = PresignedUrlProvider(
self.mock_synapse_client,
request=self.download_request,
_cached_info=seeded,
)
assert seeded == provider.get_info()
# No refetch on the happy path.
mock_get_file_handle.assert_not_called()

async def test_seeded_url_refetched_on_expiry_with_identifiers(self) -> None:
"""When the seeded URL is near expiry, the provider refetches using the
request's file handle identifiers (not None)."""
utc_now = datetime.datetime.now(tz=datetime.timezone.utc)
# Seeded URL expires in the past -> triggers a refetch.
expired_seed = PresignedUrlInfo(
file_name="myFile.txt",
url="https://synapse.org/somefile.txt",
expiration_utc=utc_now - datetime.timedelta(seconds=5),
)
fresh_date = utc_now + datetime.timedelta(hours=1)
refetched_url = (
"https://synapse.org/refetched.txt"
f"?X-Amz-Date={fresh_date.strftime('%Y%m%dT%H%M%SZ')}"
"&X-Amz-Expires=3600&X-Amz-Signature=123456"
)

with mock.patch(
"synapseclient.core.download.download_async.get_file_handle_for_download",
return_value={
"fileHandle": {"fileName": "myFile.txt"},
"preSignedURL": refetched_url,
},
) as mock_get_file_handle:
provider = PresignedUrlProvider(
self.mock_synapse_client,
request=self.download_request,
_cached_info=expired_seed,
)
info = provider.get_info()

# A single refetch occurred, keyed on the request's identifiers.
mock_get_file_handle.assert_called_once_with(
file_handle_id=123,
synapse_id="syn456",
entity_type="FileEntity",
synapse_client=self.mock_synapse_client,
)
assert info.url == refetched_url


async def test_generate_chunk_ranges() -> None:
# test using smaller chunk size
download_async.SYNAPSE_DEFAULT_DOWNLOAD_PART_SIZE = 8
Expand Down
Original file line number Diff line number Diff line change
@@ -1,16 +1,38 @@
"""Unit tests for synapseclient.core.download.download_functions"""

import datetime
import os
from unittest.mock import patch
from unittest.mock import AsyncMock, patch

import pytest

from synapseclient import File, Synapse
from synapseclient.core import utils
from synapseclient.core.constants import concrete_types
from synapseclient.core.download import (
PresignedUrlInfo,
download_file_entity,
download_functions,
ensure_download_location_is_directory,
)
from synapseclient.core.download.download_async import (
SYNAPSE_DEFAULT_DOWNLOAD_PART_SIZE,
)


def _presigned_url(seconds_until_expiry: int = 3600) -> str:
"""Build a syntactically valid AWS pre-signed URL that expires in the future."""
made = datetime.datetime.now(tz=datetime.timezone.utc) - datetime.timedelta(
seconds=1
)
return (
"https://s3.amazonaws.com/proddata/bucket/key.txt"
"?X-Amz-Algorithm=AWS4-HMAC-SHA256"
f"&X-Amz-Date={made.strftime('%Y%m%dT%H%M%SZ')}"
f"&X-Amz-Expires={seconds_until_expiry}"
"&X-Amz-SignedHeaders=host"
"&X-Amz-Signature=signature-value"
)


def test_ensure_download_location_is_directory() -> None:
Expand Down Expand Up @@ -40,3 +62,145 @@ async def test_download_file_entity_correct_local_state(syn: Synapse) -> None:
assert os.path.dirname(mock_cache_path) == file_entity.cacheDir
assert 1 == len(file_entity.files)
assert os.path.basename(mock_cache_path) == file_entity.files[0]


class TestMultiThreadedPresignedUrlReuse:
"""Tests that a multi-threaded download reuses the pre-signed URL already
fetched in ``download_by_file_handle`` instead of requesting a second one,
while still passing the file handle identifiers needed to refetch on expiry.
"""

@pytest.fixture(autouse=True)
def _large_s3_file_handle(self, monkeypatch: pytest.MonkeyPatch) -> dict:
"""A file handle for a large S3 file that routes to the multi-threaded path."""
self.presigned_url = _presigned_url()
self.file_handle_result = {
"preSignedURL": self.presigned_url,
"fileHandle": {
"id": "9999",
"fileName": "big_file.txt",
"concreteType": concrete_types.S3_FILE_HANDLE,
"contentSize": SYNAPSE_DEFAULT_DOWNLOAD_PART_SIZE + 1,
"contentMd5": "abc123",
"storageLocationId": 1,
},
}
return self.file_handle_result

async def test_multi_threaded_download_fetches_url_once_and_forwards_it(
self, syn: Synapse, monkeypatch: pytest.MonkeyPatch, tmp_path
) -> None:
"""The metadata/pre-signed batch endpoint is hit exactly once and the
resulting URL is forwarded to the multi-threaded downloader (so it does
not request a second one)."""
monkeypatch.setattr(syn, "multi_threaded", True)
destination = str(tmp_path / "big_file.txt")

with (
patch.object(
download_functions,
"get_file_handle_for_download_async",
new=AsyncMock(return_value=self.file_handle_result),
) as mock_get_handle,
patch.object(
download_functions,
"download_from_url_multi_threaded",
new=AsyncMock(return_value=destination),
) as mock_multi_threaded,
patch.object(
download_functions.sts_transfer,
"is_boto_sts_transfer_enabled",
return_value=False,
),
patch.object(syn.cache, "add"),
):
await download_functions.download_by_file_handle(
file_handle_id="9999",
synapse_id="syn123",
entity_type="FileEntity",
destination=destination,
synapse_client=syn,
)

# The pre-signed URL / metadata batch call is made exactly once.
assert mock_get_handle.call_count == 1

# The already-fetched URL is forwarded so the downloader does not fetch again.
assert mock_multi_threaded.call_count == 1
forwarded = mock_multi_threaded.call_args.kwargs["presigned_url"]
assert isinstance(forwarded, PresignedUrlInfo)
assert forwarded.url == self.presigned_url
assert forwarded.file_name == "big_file.txt"

# File handle identifiers are still passed so a refetch is possible on expiry.
assert mock_multi_threaded.call_args.kwargs["file_handle_id"] == "9999"
assert mock_multi_threaded.call_args.kwargs["object_id"] == "syn123"
assert mock_multi_threaded.call_args.kwargs["object_type"] == "FileEntity"


class TestDownloadFromUrlMultiThreadedRequest:
"""Tests for the DownloadRequest constructed by download_from_url_multi_threaded."""

async def test_seeded_url_request_carries_file_handle_identifiers(
self, syn: Synapse, tmp_path
) -> None:
"""When a pre-signed URL is supplied along with file handle identifiers,
the DownloadRequest carries both so the downloader can refetch on expiry."""
presigned = PresignedUrlInfo(
file_name="big_file.txt",
url=_presigned_url(),
expiration_utc=datetime.datetime.now(tz=datetime.timezone.utc)
+ datetime.timedelta(hours=1),
)
captured = {}

async def _capture(client, download_request) -> None:
captured["request"] = download_request
# Create the temp file so the post-download move succeeds.
open(download_request.path, "wb").close()

with patch.object(download_functions, "download_file", new=_capture):
await download_functions.download_from_url_multi_threaded(
destination=str(tmp_path / "big_file.txt"),
file_handle_id="9999",
object_id="syn123",
object_type="FileEntity",
presigned_url=presigned,
synapse_client=syn,
)

request = captured["request"]
assert request.presigned_url is presigned
assert request.file_handle_id == 9999
assert request.object_id == "syn123"
assert request.object_type == "FileEntity"

async def test_seeded_url_request_without_identifiers_preserves_wiki_behavior(
self, syn: Synapse, tmp_path
) -> None:
"""Wiki callers pass a pre-signed URL without file handle identifiers.
These must remain None (and int(None) must not be attempted)."""
presigned = PresignedUrlInfo(
file_name="attachment.txt",
url=_presigned_url(),
expiration_utc=datetime.datetime.now(tz=datetime.timezone.utc)
+ datetime.timedelta(hours=1),
)
captured = {}

async def _capture(client, download_request) -> None:
captured["request"] = download_request
open(download_request.path, "wb").close()

with patch.object(download_functions, "download_file", new=_capture):
await download_functions.download_from_url_multi_threaded(
destination=str(tmp_path),
presigned_url=presigned,
synapse_client=syn,
)

request = captured["request"]
assert request.presigned_url is presigned
assert request.file_handle_id is None
assert request.object_id is None
assert request.object_type is None
24 changes: 24 additions & 0 deletions tests/unit/synapseclient/core/unit_test_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from synapseclient.core.download import (
DownloadRequest,
PresignedUrlInfo,
_pre_signed_url_expiration_time,
download_by_file_handle,
download_file,
download_from_url,
Expand Down Expand Up @@ -435,11 +436,23 @@ async def test_mock_download(syn: Synapse) -> None:
class TestDownloadFileHandle:
mock_file_handle = {
"id": "123",
"fileName": "foo.txt",
"concreteType": concrete_types.S3_FILE_HANDLE,
"contentMd5": "someMD5",
"contentSize": download_async.SYNAPSE_DEFAULT_DOWNLOAD_PART_SIZE + 1,
}

# A syntactically valid AWS pre-signed URL that expires well in the future, so
# _pre_signed_url_expiration_time can parse it.
mock_presigned_url = (
"https://s3.amazonaws.com/proddata/bucket/foo.txt"
"?X-Amz-Algorithm=AWS4-HMAC-SHA256"
"&X-Amz-Date=20200101T000000Z"
"&X-Amz-Expires=999999999"
"&X-Amz-SignedHeaders=host"
"&X-Amz-Signature=signature-value"
)

@pytest_asyncio.fixture(autouse=True, loop_scope="function", scope="function")
def init_syn(self, syn: Synapse) -> None:
self.syn = syn
Expand All @@ -463,6 +476,7 @@ async def test_multithread_true_s3_file_handle(self) -> None:
):
mock_getFileHandleDownload.return_value = {
"fileHandle": self.mock_file_handle,
"preSignedURL": self.mock_presigned_url,
}

self.syn.multi_threaded = True
Expand All @@ -474,12 +488,22 @@ async def test_multithread_true_s3_file_handle(self) -> None:
synapse_client=self.syn,
)

# The pre-signed URL already retrieved is forwarded to the multi-threaded
# downloader (so it is not fetched a second time), while the file handle
# identifiers are still passed so it can refetch the URL if it expires.
mock_multi_thread_download.assert_called_once_with(
file_handle_id=123,
object_id=456,
object_type="FileEntity",
destination="/myfakepath",
expected_md5="someMD5",
presigned_url=PresignedUrlInfo(
file_name="foo.txt",
url=self.mock_presigned_url,
expiration_utc=_pre_signed_url_expiration_time(
self.mock_presigned_url
),
),
synapse_client=self.syn,
)

Expand Down
Loading