From 278c1cc8638e2a5764e64b0665d4b8a7733c249f Mon Sep 17 00:00:00 2001
From: Noa Dove
Date: Tue, 24 Feb 2026 19:22:05 -0800
Subject: [PATCH 1/2] Use requester-pays for mirroring in AnVIL deployments (#7794)

---
 environment.py                     |  6 ++++++
 src/azul/__init__.py               |  4 ++++
 src/azul/drs.py                    | 33 ++++++++++++++++++++++++---------
 src/azul/indexer/mirror_service.py |  7 ++++++-
 4 files changed, 40 insertions(+), 10 deletions(-)

diff --git a/environment.py b/environment.py
index 06195b3339..bb6e984afa 100644
--- a/environment.py
+++ b/environment.py
@@ -744,6 +744,12 @@ def env() -> Mapping[str, str | None]:
         #
         'AZUL_TERRA_SERVICE_URL': None,
 
+        # The Google Project ID associated with the Terra workspace to charge
+        # for file downloads while mirroring. If left unset, Terra pays the
+        # egress cost for the downloads.
+        #
+        'AZUL_TERRA_BILLING_PROJECT': None,
+
         # OAuth2 Client ID to be used for authenticating users. See section
         # 3.2 of the README
         #
diff --git a/src/azul/__init__.py b/src/azul/__init__.py
index 32d71a5e31..5976cbc495 100644
--- a/src/azul/__init__.py
+++ b/src/azul/__init__.py
@@ -329,6 +329,10 @@ def terra_service_url(self) -> mutable_furl:
     def ecm_service_url(self) -> mutable_furl:
         return mutable_furl(self.environ['azul_ecm_service_url'])
 
+    @property
+    def terra_billing_project(self) -> str | None:
+        return self.environ.get('AZUL_TERRA_BILLING_PROJECT')
+
     @property
     def dss_query_prefix(self) -> str:
         return self.environ.get('AZUL_DSS_QUERY_PREFIX', '')
diff --git a/src/azul/drs.py b/src/azul/drs.py
index f1cc29e87e..8d5e07d6a9 100644
--- a/src/azul/drs.py
+++ b/src/azul/drs.py
@@ -356,15 +356,26 @@ class DRSObject:
     _http_client: HttpClient
     _url: furl
 
-    def get(self, access_method: AccessMethod = AccessMethod.https) -> Access:
+    def get(self,
+            access_method: AccessMethod = AccessMethod.https,
+            access_headers: Mapping[str, str] | None = None
+            ) -> Access:
         """
         Returns access to the content of the data object identified by the
         given URI. The scheme of the URL in the returned access object depends
         on the access method specified.
+
+        :param access_headers: optional HTTP headers to send with any request
+                               to the `access` endpoint of the DRS object.
+                               They are not sent with the request for the
+                               object itself.
         """
-        return self._get(access_method)
+        return self._get(access_method, access_headers)
 
-    def _get(self, access_method: AccessMethod) -> Access:
+    def _get(self,
+             access_method: AccessMethod,
+             access_headers: Mapping[str, str] | None
+             ) -> Access:
         url = self._url
         while True:
             response = self._request(url)
@@ -384,9 +395,9 @@ def _get(self, access_method: AccessMethod) -> Access:
                     # https://github.com/ga4gh/data-repository-service-schemas/issues/361
                     assert access_method is AccessMethod.gs, R(
                         'Unexpected access method', access_method)
-                    return self._get_access(access_id, AccessMethod.https)
+                    return self._get_access(access_id, AccessMethod.https, access_headers)
                 elif access_id is not None:
-                    return self._get_access(access_id, access_method)
+                    return self._get_access(access_id, access_method, access_headers)
                 elif access_url is not None:
                     scheme = furl(access_url['url']).scheme
                     assert scheme == access_method.scheme, R(
@@ -403,11 +414,15 @@ def _get(self, access_method: AccessMethod) -> Access:
             else:
                 raise DRSStatusException(url, response)
 
-    def _get_access(self, access_id: str, access_method: AccessMethod) -> Access:
+    def _get_access(self,
+                    access_id: str,
+                    access_method: AccessMethod,
+                    access_headers: Mapping[str, str] | None
+                    ) -> Access:
         url = self._url.copy()
         url.path.add(['access', access_id])
         while True:
-            response = self._request(url)
+            response = self._request(url, headers=access_headers)
             if response.status == 200:
                 response_data = json_dict(json.loads(response.data))
                 scheme = furl(json_str(response_data['url'])).scheme
@@ -426,8 +441,8 @@ def _get_access(self, access_id: str, access_method: AccessMethod) -> Access:
             else:
                 raise DRSStatusException(url, response)
 
-    def _request(self, url: furl) -> urllib3.BaseHTTPResponse:
-        return self._http_client.request('GET', str(url), redirect=False)
+    def _request(self, url: furl, **kwargs) -> urllib3.BaseHTTPResponse:
+        return self._http_client.request('GET', str(url), **kwargs, redirect=False)
 
 
 class DRSStatusException(Exception):
diff --git a/src/azul/indexer/mirror_service.py b/src/azul/indexer/mirror_service.py
index 32cbf69e02..0af1d9c79a 100644
--- a/src/azul/indexer/mirror_service.py
+++ b/src/azul/indexer/mirror_service.py
@@ -749,7 +749,12 @@ def _repository_url(self, file: File) -> furl:
         assert file.drs_uri is not None, R(
             'File cannot be downloaded', file)
         object = self.repository_plugin.drs_object(file.drs_uri)
-        access = object.get(AccessMethod.gs)
+        billing_project = config.terra_billing_project
+        if billing_project is not None:
+            access_headers = {'x-user-project': billing_project}
+        else:
+            access_headers = None
+        access = object.get(AccessMethod.gs, access_headers)
         assert access.method is AccessMethod.https, access
         return furl(access.url)
 

From 86488dff4fc3dad3ee16216b0948dfc5fafa19e9 Mon Sep 17 00:00:00 2001
From: Noa Dove
Date: Wed, 15 Apr 2026 13:04:33 -0700
Subject: [PATCH 2/2] drop! Test requester-pays in dummy workspace

---
 deployments/anvilprod/environment.py |  1 +
 deployments/hammerbox/environment.py |  1 +
 scripts/scratch_7.py                 | 49 ++++++++++++++++++++++++++++
 3 files changed, 51 insertions(+)
 create mode 100644 scripts/scratch_7.py

diff --git a/deployments/anvilprod/environment.py b/deployments/anvilprod/environment.py
index 054425bd6d..c0fb259b5a 100644
--- a/deployments/anvilprod/environment.py
+++ b/deployments/anvilprod/environment.py
@@ -1323,6 +1323,7 @@ def env() -> Mapping[str, str | None]:
         'AZUL_SAM_SERVICE_URL': 'https://sam.dsde-prod.broadinstitute.org',
         'AZUL_DUOS_SERVICE_URL': 'https://consent.dsde-prod.broadinstitute.org',
         'AZUL_TERRA_SERVICE_URL': 'https://firecloud-orchestration.dsde-prod.broadinstitute.org',
+        'AZUL_TERRA_BILLING_PROJECT': 'terra-aae33465',
         'azul_ecm_service_url': 'https://externalcreds.dsde-prod.broadinstitute.org',
 
         'AZUL_ENABLE_MONITORING': '1',
diff --git a/deployments/hammerbox/environment.py b/deployments/hammerbox/environment.py
index 71b24db322..f9428a05da 100644
--- a/deployments/hammerbox/environment.py
+++ b/deployments/hammerbox/environment.py
@@ -1337,6 +1337,7 @@ def env() -> Mapping[str, str | None]:
         'AZUL_SAM_SERVICE_URL': 'https://sam.dsde-prod.broadinstitute.org',
         'AZUL_DUOS_SERVICE_URL': 'https://consent.dsde-prod.broadinstitute.org',
         'AZUL_TERRA_SERVICE_URL': 'https://firecloud-orchestration.dsde-prod.broadinstitute.org',
+        'AZUL_TERRA_BILLING_PROJECT': 'terra-aae33465',
         'azul_ecm_service_url': 'https://externalcreds.dsde-prod.broadinstitute.org',
 
         # Personal deployments & `hammerbox` share an ES domain with `anvilprod`
diff --git a/scripts/scratch_7.py b/scripts/scratch_7.py
new file mode 100644
index 0000000000..bf1d2f015d
--- /dev/null
+++ b/scripts/scratch_7.py
@@ -0,0 +1,49 @@
+from azul import config
+from azul.indexer.mirror_service import MirrorWorkerService
+from azul.service import Filters
+from azul.service.index_service import IndexService
+
+
+def download(catalog, source_id, file_uuid):
+    index_service = IndexService()
+    mirror_service = MirrorWorkerService(catalog=catalog, schema_url_func=None)
+    file = index_service.get_data_file(catalog=catalog,
+                                       file_uuid=file_uuid,
+                                       file_version=None,
+                                       filters=Filters(explicit={},
+                                                       source_ids={source_id}))
+    assert file is not None
+
+    data = mirror_service._download(file, part=None)
+    print('Downloaded', len(data), 'bytes')
+
+
+sandbox_args = {
+    'source_id': 'b1083e8b-4de9-467a-97de-18179c4e6bd1',
+    'file_uuid': '60e25442-aba0-4934-af42-be0d536112de'
+}
+
+hammerbox_args = {
+    'source_id': 'b3b5fbcb-583d-4894-90bc-19abe85a0f4f',
+    'file_uuid': '5a795c00-3df1-468d-b4a0-2e7fe048b6d4'
+}
+
+
+def main():
+    deployment = config.deployment.name
+    match deployment:
+        case 'sandbox':
+            args = sandbox_args
+        case 'hammerbox':
+            args = hammerbox_args
+        case _:
+            assert False, deployment
+
+    download(
+        catalog=config.default_catalog,
+        **args
+    )
+
+
+if __name__ == '__main__':
+    main()