From b9bd39ef0082884a23aab6f004e623269f732e04 Mon Sep 17 00:00:00 2001 From: BryanFauble <17128019+BryanFauble@users.noreply.github.com> Date: Fri, 12 Jun 2026 16:58:57 +0000 Subject: [PATCH] Reuse pre-signed URL for multi-threaded downloads Multi-threaded downloads (files > 8 MB) requested two pre-signed URLs: one in download_by_file_handle to choose the download strategy, and a second inside the multi-threaded downloader. The first URL's preSignedURL field was discarded, doubling the pre-signed URL count (and download metrics) for every large-file download. Forward the already-retrieved URL into download_from_url_multi_threaded so the downloader reuses it instead of fetching a second one. The file handle identifiers (file_handle_id/object_id/object_type) are now passed alongside the seeded URL so PresignedUrlProvider can still refetch a fresh URL if it expires mid-download. Wiki callers omit these identifiers, preserving their existing no-refetch behavior. --- .../core/download/download_functions.py | 19 ++ .../core/download/unit_test_download_async.py | 78 ++++++++ .../download/unit_test_download_functions.py | 166 +++++++++++++++++- .../synapseclient/core/unit_test_download.py | 24 +++ 4 files changed, 286 insertions(+), 1 deletion(-) diff --git a/synapseclient/core/download/download_functions.py b/synapseclient/core/download/download_functions.py index 5595359a4..598d2e968 100644 --- a/synapseclient/core/download/download_functions.py +++ b/synapseclient/core/download/download_functions.py @@ -602,6 +602,18 @@ def download_fn( ): span.set_attribute("synapse.storage.provider", "s3") + # Reuse the pre-signed URL already retrieved above rather than + # requesting a second one inside the multi-threaded downloader. The + # file_handle_id/object_id/object_type are still passed so the + # downloader can refetch the URL if it expires mid-download. + presigned_url_info = PresignedUrlInfo( + file_name=file_handle["fileName"], + url=file_handle_result["preSignedURL"], + expiration_utc=_pre_signed_url_expiration_time( + file_handle_result["preSignedURL"] + ), + ) + # run the download multi threaded if the file supports it, we're configured to do so, # and the file is large enough that it would be broken into parts to take advantage of # multiple downloading threads. otherwise it's more efficient to run the download as a simple @@ -612,6 +624,7 @@ def download_fn( object_type=entity_type, destination=destination, expected_md5=actual_md5, + presigned_url=presigned_url_info, synapse_client=syn, ) @@ -738,7 +751,13 @@ async def download_from_url_multi_threaded( # If the destination is a directory, then the file name should be the same as the file name in the presigned url # This is added to ensure the temp file can be copied to the desired destination without changing the file name destination = os.path.join(destination, presigned_url.file_name) + # Pass through the file handle identifiers so the downloader can refetch a + # fresh pre-signed URL if the seeded one expires mid-download. Callers that + # cannot refetch (e.g. wiki attachments) omit these, leaving them as None. request = DownloadRequest( + file_handle_id=int(file_handle_id) if file_handle_id is not None else None, + object_id=object_id, + object_type=object_type, path=temp_destination, debug=client.debug, presigned_url=presigned_url, diff --git a/tests/unit/synapseclient/core/download/unit_test_download_async.py b/tests/unit/synapseclient/core/download/unit_test_download_async.py index 587463baa..8b6b8f45e 100644 --- a/tests/unit/synapseclient/core/download/unit_test_download_async.py +++ b/tests/unit/synapseclient/core/download/unit_test_download_async.py @@ -152,6 +152,84 @@ async def test_pre_signed_url_expiration_time(self) -> None: assert expected == download_async._pre_signed_url_expiration_time(url) +class TestSeededPresignedUrlProvider: + """Tests that a PresignedUrlProvider seeded with an already-fetched URL reuses + it on the happy path but still refetches (using the request's file handle + identifiers) once the seeded URL nears expiry mid-download.""" + + @pytest.fixture(scope="function", autouse=True) + def setup_method(self) -> None: + self.mock_synapse_client = mock.create_autospec(Synapse) + # Request carries the file handle identifiers so a refetch is possible. + self.download_request = DownloadRequest( + file_handle_id=123, + object_id="syn456", + object_type="FileEntity", + path="/myFakepath", + ) + + async def test_seeded_url_reused_without_refetch(self) -> None: + """A seeded, unexpired URL is returned without calling the batch endpoint.""" + utc_now = datetime.datetime.now(tz=datetime.timezone.utc) + seeded = PresignedUrlInfo( + file_name="myFile.txt", + url="https://synapse.org/somefile.txt", + expiration_utc=utc_now + datetime.timedelta(hours=1), + ) + + with mock.patch( + "synapseclient.core.download.download_async.get_file_handle_for_download", + ) as mock_get_file_handle: + provider = PresignedUrlProvider( + self.mock_synapse_client, + request=self.download_request, + _cached_info=seeded, + ) + assert seeded == provider.get_info() + # No refetch on the happy path. + mock_get_file_handle.assert_not_called() + + async def test_seeded_url_refetched_on_expiry_with_identifiers(self) -> None: + """When the seeded URL is near expiry, the provider refetches using the + request's file handle identifiers (not None).""" + utc_now = datetime.datetime.now(tz=datetime.timezone.utc) + # Seeded URL expires in the past -> triggers a refetch. + expired_seed = PresignedUrlInfo( + file_name="myFile.txt", + url="https://synapse.org/somefile.txt", + expiration_utc=utc_now - datetime.timedelta(seconds=5), + ) + fresh_date = utc_now + datetime.timedelta(hours=1) + refetched_url = ( + "https://synapse.org/refetched.txt" + f"?X-Amz-Date={fresh_date.strftime('%Y%m%dT%H%M%SZ')}" + "&X-Amz-Expires=3600&X-Amz-Signature=123456" + ) + + with mock.patch( + "synapseclient.core.download.download_async.get_file_handle_for_download", + return_value={ + "fileHandle": {"fileName": "myFile.txt"}, + "preSignedURL": refetched_url, + }, + ) as mock_get_file_handle: + provider = PresignedUrlProvider( + self.mock_synapse_client, + request=self.download_request, + _cached_info=expired_seed, + ) + info = provider.get_info() + + # A single refetch occurred, keyed on the request's identifiers. + mock_get_file_handle.assert_called_once_with( + file_handle_id=123, + synapse_id="syn456", + entity_type="FileEntity", + synapse_client=self.mock_synapse_client, + ) + assert info.url == refetched_url + + async def test_generate_chunk_ranges() -> None: # test using smaller chunk size download_async.SYNAPSE_DEFAULT_DOWNLOAD_PART_SIZE = 8 diff --git a/tests/unit/synapseclient/core/download/unit_test_download_functions.py b/tests/unit/synapseclient/core/download/unit_test_download_functions.py index 839781142..c9e35a6e5 100644 --- a/tests/unit/synapseclient/core/download/unit_test_download_functions.py +++ b/tests/unit/synapseclient/core/download/unit_test_download_functions.py @@ -1,16 +1,38 @@ """Unit tests for synapseclient.core.download.download_functions""" +import datetime import os -from unittest.mock import patch +from unittest.mock import AsyncMock, patch import pytest from synapseclient import File, Synapse from synapseclient.core import utils +from synapseclient.core.constants import concrete_types from synapseclient.core.download import ( + PresignedUrlInfo, download_file_entity, + download_functions, ensure_download_location_is_directory, ) +from synapseclient.core.download.download_async import ( + SYNAPSE_DEFAULT_DOWNLOAD_PART_SIZE, +) + + +def _presigned_url(seconds_until_expiry: int = 3600) -> str: + """Build a syntactically valid AWS pre-signed URL that expires in the future.""" + made = datetime.datetime.now(tz=datetime.timezone.utc) - datetime.timedelta( + seconds=1 + ) + return ( + "https://s3.amazonaws.com/proddata/bucket/key.txt" + "?X-Amz-Algorithm=AWS4-HMAC-SHA256" + f"&X-Amz-Date={made.strftime('%Y%m%dT%H%M%SZ')}" + f"&X-Amz-Expires={seconds_until_expiry}" + "&X-Amz-SignedHeaders=host" + "&X-Amz-Signature=signature-value" + ) def test_ensure_download_location_is_directory() -> None: @@ -40,3 +62,145 @@ async def test_download_file_entity_correct_local_state(syn: Synapse) -> None: assert os.path.dirname(mock_cache_path) == file_entity.cacheDir assert 1 == len(file_entity.files) assert os.path.basename(mock_cache_path) == file_entity.files[0] + + +class TestMultiThreadedPresignedUrlReuse: + """Tests that a multi-threaded download reuses the pre-signed URL already + fetched in ``download_by_file_handle`` instead of requesting a second one, + while still passing the file handle identifiers needed to refetch on expiry. + """ + + @pytest.fixture(autouse=True) + def _large_s3_file_handle(self, monkeypatch: pytest.MonkeyPatch) -> dict: + """A file handle for a large S3 file that routes to the multi-threaded path.""" + self.presigned_url = _presigned_url() + self.file_handle_result = { + "preSignedURL": self.presigned_url, + "fileHandle": { + "id": "9999", + "fileName": "big_file.txt", + "concreteType": concrete_types.S3_FILE_HANDLE, + "contentSize": SYNAPSE_DEFAULT_DOWNLOAD_PART_SIZE + 1, + "contentMd5": "abc123", + "storageLocationId": 1, + }, + } + return self.file_handle_result + + async def test_multi_threaded_download_fetches_url_once_and_forwards_it( + self, syn: Synapse, monkeypatch: pytest.MonkeyPatch, tmp_path + ) -> None: + """The metadata/pre-signed batch endpoint is hit exactly once and the + resulting URL is forwarded to the multi-threaded downloader (so it does + not request a second one).""" + monkeypatch.setattr(syn, "multi_threaded", True) + destination = str(tmp_path / "big_file.txt") + + with ( + patch.object( + download_functions, + "get_file_handle_for_download_async", + new=AsyncMock(return_value=self.file_handle_result), + ) as mock_get_handle, + patch.object( + download_functions, + "download_from_url_multi_threaded", + new=AsyncMock(return_value=destination), + ) as mock_multi_threaded, + patch.object( + download_functions.sts_transfer, + "is_boto_sts_transfer_enabled", + return_value=False, + ), + patch.object(syn.cache, "add"), + ): + await download_functions.download_by_file_handle( + file_handle_id="9999", + synapse_id="syn123", + entity_type="FileEntity", + destination=destination, + synapse_client=syn, + ) + + # The pre-signed URL / metadata batch call is made exactly once. + assert mock_get_handle.call_count == 1 + + # The already-fetched URL is forwarded so the downloader does not fetch again. + assert mock_multi_threaded.call_count == 1 + forwarded = mock_multi_threaded.call_args.kwargs["presigned_url"] + assert isinstance(forwarded, PresignedUrlInfo) + assert forwarded.url == self.presigned_url + assert forwarded.file_name == "big_file.txt" + + # File handle identifiers are still passed so a refetch is possible on expiry. + assert mock_multi_threaded.call_args.kwargs["file_handle_id"] == "9999" + assert mock_multi_threaded.call_args.kwargs["object_id"] == "syn123" + assert mock_multi_threaded.call_args.kwargs["object_type"] == "FileEntity" + + +class TestDownloadFromUrlMultiThreadedRequest: + """Tests for the DownloadRequest constructed by download_from_url_multi_threaded.""" + + async def test_seeded_url_request_carries_file_handle_identifiers( + self, syn: Synapse, tmp_path + ) -> None: + """When a pre-signed URL is supplied along with file handle identifiers, + the DownloadRequest carries both so the downloader can refetch on expiry.""" + presigned = PresignedUrlInfo( + file_name="big_file.txt", + url=_presigned_url(), + expiration_utc=datetime.datetime.now(tz=datetime.timezone.utc) + + datetime.timedelta(hours=1), + ) + captured = {} + + async def _capture(client, download_request) -> None: + captured["request"] = download_request + # Create the temp file so the post-download move succeeds. + open(download_request.path, "wb").close() + + with patch.object(download_functions, "download_file", new=_capture): + await download_functions.download_from_url_multi_threaded( + destination=str(tmp_path / "big_file.txt"), + file_handle_id="9999", + object_id="syn123", + object_type="FileEntity", + presigned_url=presigned, + synapse_client=syn, + ) + + request = captured["request"] + assert request.presigned_url is presigned + assert request.file_handle_id == 9999 + assert request.object_id == "syn123" + assert request.object_type == "FileEntity" + + async def test_seeded_url_request_without_identifiers_preserves_wiki_behavior( + self, syn: Synapse, tmp_path + ) -> None: + """Wiki callers pass a pre-signed URL without file handle identifiers. + These must remain None (and int(None) must not be attempted).""" + presigned = PresignedUrlInfo( + file_name="attachment.txt", + url=_presigned_url(), + expiration_utc=datetime.datetime.now(tz=datetime.timezone.utc) + + datetime.timedelta(hours=1), + ) + captured = {} + + async def _capture(client, download_request) -> None: + captured["request"] = download_request + open(download_request.path, "wb").close() + + with patch.object(download_functions, "download_file", new=_capture): + await download_functions.download_from_url_multi_threaded( + destination=str(tmp_path), + presigned_url=presigned, + synapse_client=syn, + ) + + request = captured["request"] + assert request.presigned_url is presigned + assert request.file_handle_id is None + assert request.object_id is None + assert request.object_type is None diff --git a/tests/unit/synapseclient/core/unit_test_download.py b/tests/unit/synapseclient/core/unit_test_download.py index 72fdd8247..dde7169e0 100644 --- a/tests/unit/synapseclient/core/unit_test_download.py +++ b/tests/unit/synapseclient/core/unit_test_download.py @@ -20,6 +20,7 @@ from synapseclient.core.download import ( DownloadRequest, PresignedUrlInfo, + _pre_signed_url_expiration_time, download_by_file_handle, download_file, download_from_url, @@ -435,11 +436,23 @@ async def test_mock_download(syn: Synapse) -> None: class TestDownloadFileHandle: mock_file_handle = { "id": "123", + "fileName": "foo.txt", "concreteType": concrete_types.S3_FILE_HANDLE, "contentMd5": "someMD5", "contentSize": download_async.SYNAPSE_DEFAULT_DOWNLOAD_PART_SIZE + 1, } + # A syntactically valid AWS pre-signed URL that expires well in the future, so + # _pre_signed_url_expiration_time can parse it. + mock_presigned_url = ( + "https://s3.amazonaws.com/proddata/bucket/foo.txt" + "?X-Amz-Algorithm=AWS4-HMAC-SHA256" + "&X-Amz-Date=20200101T000000Z" + "&X-Amz-Expires=999999999" + "&X-Amz-SignedHeaders=host" + "&X-Amz-Signature=signature-value" + ) + @pytest_asyncio.fixture(autouse=True, loop_scope="function", scope="function") def init_syn(self, syn: Synapse) -> None: self.syn = syn @@ -463,6 +476,7 @@ async def test_multithread_true_s3_file_handle(self) -> None: ): mock_getFileHandleDownload.return_value = { "fileHandle": self.mock_file_handle, + "preSignedURL": self.mock_presigned_url, } self.syn.multi_threaded = True @@ -474,12 +488,22 @@ async def test_multithread_true_s3_file_handle(self) -> None: synapse_client=self.syn, ) + # The pre-signed URL already retrieved is forwarded to the multi-threaded + # downloader (so it is not fetched a second time), while the file handle + # identifiers are still passed so it can refetch the URL if it expires. mock_multi_thread_download.assert_called_once_with( file_handle_id=123, object_id=456, object_type="FileEntity", destination="/myfakepath", expected_md5="someMD5", + presigned_url=PresignedUrlInfo( + file_name="foo.txt", + url=self.mock_presigned_url, + expiration_utc=_pre_signed_url_expiration_time( + self.mock_presigned_url + ), + ), synapse_client=self.syn, )